view example/cuda_fft/main.cc @ 2069:26aa08c9a1de draft default tip

cuda example fix
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Sun, 12 Feb 2017 10:04:55 +0900
parents faaea4e1ce1c
children
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <cuda.h>
#include <vector_types.h>

#include "pgm.h"

#define PI 3.14159265358979
#define MAX_SOURCE_SIZE (0x100000)
#define AMP(a, b) (sqrt((a)*(a)+(b)))

static double st_time;
static double ed_time;
enum Mode {
    forward = 0,
    inverse = 1
};

CUmodule module;

static double
getTime() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + (double)tv.tv_usec*1e-6;
}

int
setWorkSize(int* xblocks, int* yblocks, int x, int y)
{
    switch(y) {
    case 1:
        *xblocks = x;
        *yblocks = 1;
        break;
    default:
        *xblocks = x;
        *yblocks = y;
        break;
    }

    return 0;
}

int
fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction)
{

    unsigned int flag;
    switch (direction) {
    case forward:flag = 0x00000000; break;
    case inverse:flag = 0x80000000; break;
    }

    int n = 1<<m;
    int xblocks, yblocks;
    setWorkSize(&xblocks, &yblocks, n, n);

    CUfunction bitReverse;
    cuModuleGetFunction(&bitReverse, module, "bitReverse");
    
    void* bitReverse_args[] = {&dst, &src, &m, &n};

    cuLaunchKernel(bitReverse,
                   xblocks, yblocks, 1,
                   1, 1, 1,
                   0, NULL, bitReverse_args, NULL);

    CUfunction butterfly;
    cuModuleGetFunction(&butterfly, module, "butterfly");

    setWorkSize(&xblocks, &yblocks, n/2, n);
    void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
    for (int i=1;i<=m;i++) {
        butterfly_args[4] = &i;
        cuLaunchKernel(butterfly,
                       xblocks, yblocks, 1,
                       1, 1, 1,
                       0, NULL, butterfly_args, NULL);
    }
    
    CUfunction norm;
    cuModuleGetFunction(&norm, module, "norm");

    void* norm_args[] = {&dst, &n};
    if (direction == inverse) {
        setWorkSize(&xblocks, &yblocks, n, n);
        cuLaunchKernel(norm,
                       xblocks, yblocks, 1,
                       1, 1, 1,
                       0, NULL, norm_args, NULL);
    }

    return 0;
}

char*
init(int argc, char**argv){

    char *filename = 0;

    for (int i = 1; argv[i]; ++i) {
        if (strcmp(argv[i], "-file") == 0) {
            filename = argv[i+1];
        }
    }

    if ( (argc == 1)||(filename==0)) {
        printf("Usage: ./fft -file [image filename] \n");
        exit(-1);
    }

    return filename;
}

int main(int args, char* argv[]) {
    cuInit(0);

    CUdevice device;
    cuDeviceGet(&device, 0);

    CUcontext context;
    cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);

    cuModuleLoad(&module, "fft.ptx");

    char* pgm_file = init(args, argv);

    pgm_t ipgm;
    int err = readPGM(&ipgm, pgm_file);
    if (err<0) {
        fprintf(stderr, "Failed to read image file.\n");
        exit(1);
    }

    int n = ipgm.width;
    int m = (int)(log((double)n)/log(2.0));

    pgm_t opgm;

    float2* xm = (float2*)malloc(n*n*sizeof(float2));
    float2* rm = (float2*)malloc(n*n*sizeof(float2));
    float2* wm = (float2*)malloc(n/2*sizeof(float2));

    for (int i=0; i<n*n; i++) {
        xm[i].x = (float)ipgm.buf[i];
        xm[i].y = (float)0;
    }
    
    st_time = getTime();

    // memory allocate
    CUdeviceptr xmobj;
    cuMemAlloc(&xmobj, n*n*sizeof(float2));
    
    CUdeviceptr rmobj;
    cuMemAlloc(&rmobj, n*n*sizeof(float2));

    CUdeviceptr wmobj;
    cuMemAlloc(&wmobj, n/2*sizeof(float2));

    CUfunction spinFact;
    cuModuleGetFunction(&spinFact, module, "spinFact");
    
    int xblocks, yblocks;
    setWorkSize(&xblocks, &yblocks, n/2, 1);

    // Synchronous data transfer(host to device)
    cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));

    void* spinFact_args[] = {&wmobj, &n};
    cuLaunchKernel(spinFact,
                   xblocks, yblocks, 1,
                   1, 1, 1,
                   0, NULL, spinFact_args, NULL);
    

    fftCore(rmobj, xmobj, wmobj, m, forward);
    
    CUfunction transpose;
    cuModuleGetFunction(&transpose, module, "transpose");

    setWorkSize(&xblocks, &yblocks, n, n);

    void* transpose_args[] = {&xmobj, &rmobj, &n};
    cuLaunchKernel(transpose,
                   xblocks, yblocks, 1,
                   1, 1, 1,
                   0, NULL, transpose_args, NULL);


    fftCore(rmobj, xmobj, wmobj, m, forward);
    

    CUfunction highPassFilter;
    cuModuleGetFunction(&highPassFilter, module, "highPassFilter");

    setWorkSize(&xblocks, &yblocks, n, n);

    int radius = n/8;
    void*highPassFilter_args[] = {&rmobj, &n, &radius};
    cuLaunchKernel(highPassFilter,
                   xblocks, yblocks, 1,
                   1, 1, 1,
                   0, NULL, highPassFilter_args, NULL);


    fftCore(xmobj, rmobj, wmobj, m, inverse);

    setWorkSize(&xblocks, &yblocks, n, n);
    
    void* transpose2_args[] = {&rmobj, &xmobj, &n};
    cuLaunchKernel(transpose,
                   xblocks, yblocks, 1,
                   1, 1, 1,
                   0, NULL, transpose2_args, NULL);
    
    fftCore(xmobj, rmobj, wmobj, m, inverse);


    cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));

    cuStreamSynchronize(NULL);

    float* ampd;
    ampd = (float*)malloc(n*n*sizeof(float));

    for (int i=0;i<n*n;i++)
        ampd[i] = (AMP(xm[i].x, xm[i].y));
    
    opgm.width = n;
    opgm.height = n;
    normalizeF2PGM(&opgm, ampd);
    free(ampd);

    ed_time = getTime();

    writePGM(&opgm, "output.pgm");

    // memory release
    cuMemFree(xmobj);
    cuMemFree(rmobj);
    cuMemFree(wmobj);
    cuModuleUnload(module);
    cuCtxDestroy(context);

    destroyPGM(&ipgm);
    destroyPGM(&opgm);

    free(xm);
    free(rm);
    free(wm);

    printf("Time: %0.6f\n", ed_time-st_time);

    return 0;
}