view example/cuda_fft/main.cc @ 2008:2c8eab01cc78 draft

implement fft using cuda
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 03 Jun 2014 18:10:19 +0900
parents bc2121b09cbc
children 6fced32f85fd
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <cuda.h>
#include <vector_types.h>

#include "pgm.h"

#define PI 3.14159265358979
#define MAX_SOURCE_SIZE (0x100000)
#define AMP(a, b) (sqrt((a)*(a)+(b)))

static double st_time;
static double ed_time;
enum Mode {
    forward = 0,
    inverse = 1
};

CUmodule module;

static double
getTime() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + (double)tv.tv_usec*1e-6;
}

int
setWorkSize(int* block, int* thread, int x, int y)
{
    switch(y) {
    case 1:
        *block = x;
        *thread = 1;
        break;
    default:
        *block = x;
        *thread = y;
        break;
    }

    return 0;
}
int
fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
{

    unsigned int flag;
    switch (direction) {
    case forward:flag = 0x00000000; break;
    case inverse:flag = 0x80000000; break;
    }

    int n = 1<<m;
    int block, thread;
    setWorkSize(&block, &thread, n, n);

    CUfunction bitReverse;
    cuModuleGetFunction(&bitReverse, module, "bitReverse");
    
    void* bitReverse_args[] = {&dst, &src, &m, &n};

    cuLaunchKernel(bitReverse,
                   block, 1, 1,
                   thread, 1, 1,
                   0, NULL, bitReverse_args, NULL);

    CUfunction butterfly;
    cuModuleGetFunction(&butterfly, module, "butterfly");

    setWorkSize(&block, &thread, n/2, n);
    void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
    for (int i=1;i<=m;i++) {
        butterfly_args[4] = &i;
        cuLaunchKernel(butterfly,
                       block, 1, 1,
                       thread, 1, 1,
                       0, NULL, butterfly_args, NULL);
    }
    
    CUfunction norm;
    cuModuleGetFunction(&norm, module, "norm");

    void* norm_args[] = {&dst, &m};
    if (direction == inverse) {
        setWorkSize(&block, &thread, n, n);
        cuLaunchKernel(norm,
                       block, 1, 1,
                       thread, 1, 1,
                       0, NULL, norm_args, NULL);
    }

    return 0;
}

char*
init(int argc, char**argv){

    char *filename = 0;

    for (int i = 1; argv[i]; ++i) {
        if (strcmp(argv[i], "-file") == 0) {
            filename = argv[i+1];
        }
    }

    if ( (argc == 1)||(filename==0)) {
        printf("Usage: ./fft -file [image filename] \n");
        exit(-1);
    }

    return filename;
}

int main(int args, char* argv[]) {
    cuInit(0);

    CUdevice device;
    cuDeviceGet(&device, 0);

    CUcontext context;
    cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);

    cuModuleLoad(&module, "fft.ptx");

    char* pgm_file = init(args, argv);

    pgm_t ipgm;
    int err = readPGM(&ipgm, pgm_file);
    if (err<0) {
        fprintf(stderr, "Failed to read image file.\n");
        exit(1);
    }

    int n = ipgm.width;
    int m = (int)(log((double)n)/log(2.0));

    pgm_t opgm;

    float2* xm = (float2*)malloc(n*n*sizeof(float2));
    float2* rm = (float2*)malloc(n*n*sizeof(float2));
    float2* wm = (float2*)malloc(n/2*sizeof(float2));

    for (int i=0; i<n*n; i++) {
        xm[i].x = (float)ipgm.buf[i];
        xm[i].y = (float)0;
    }
    
    st_time = getTime();

    // memory allocate
    CUdeviceptr xmobj;
    cuMemAlloc(&xmobj, n*n*sizeof(float2));

    CUdeviceptr rmobj;
    cuMemAlloc(&rmobj, n*n*sizeof(float2));

    CUdeviceptr wmobj;
    cuMemAlloc(&wmobj, (n/2)*sizeof(float2));

    // Synchronous data transfer(host to device)
    cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));

    CUfunction spinFact;
    cuModuleGetFunction(&spinFact, module, "spinFact");
    
    int block, thread;
    setWorkSize(&block, &thread, n/2, 1);

    void* spinFact_args[] = {&xmobj, &n};
    cuLaunchKernel(spinFact,
                   block, 1, 1,
                   thread, 1, 1,
                   0, NULL, spinFact_args, NULL);
    
    fftCore(rmobj, xmobj, wmobj, m, forward);
    
    CUfunction transpose;
    cuModuleGetFunction(&transpose, module, "transpose");

    setWorkSize(&block, &thread, n, n);

    void* transpose_args[] = {&xmobj, &rmobj, &n};
    cuLaunchKernel(transpose,
                   block, 1, 1,
                   thread, 1, 1,
                   0, NULL, transpose_args, NULL);

    fftCore(rmobj, xmobj, wmobj, m, forward);

    CUfunction highPassFilter;
    cuModuleGetFunction(&highPassFilter, module, "highPassFilter");

    setWorkSize(&block, &thread, n, n);

    int radius = n/8;
    void*highPassFilter_args[] = {&rmobj, &n, &radius};
    cuLaunchKernel(highPassFilter,
                   block, 1, 1,
                   thread, 1, 1,
                   0, NULL, highPassFilter_args, NULL);

    fftCore(xmobj, rmobj, wmobj, m, inverse);

    setWorkSize(&block, &thread, n, n);
    
    void* transpose2_args[] = {&rmobj, &xmobj, &n};
    cuLaunchKernel(transpose,
                   block, 1, 1,
                   thread, 1, 1,
                   0, NULL, transpose2_args, NULL);
    
    fftCore(xmobj, rmobj, wmobj, m, inverse);

    cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));

    float* ampd;
    ampd = (float*)malloc(n*n*sizeof(float));

    for (int i=0;i<n*n;i++)
        ampd[i] = (AMP(xm[i].x, xm[i].y));
    
    opgm.width = n;
    opgm.height = n;
    normalizeF2PGM(&opgm, ampd);
    free(ampd);

    ed_time = getTime();

    writePGM(&opgm, "output.pgm");

    // memory release
    cuMemFree(xmobj);
    cuMemFree(rmobj);
    cuMemFree(wmobj);
    cuModuleUnload(module);
    cuCtxDestroy(context);

    destroyPGM(&ipgm);
    destroyPGM(&opgm);

    free(xm);
    free(rm);
    free(wm);

    printf("Time: %0.6f\n", ed_time-st_time);

    return 0;
}