Mercurial > hg > Game > Cerium
view example/cuda_fft/main.cc @ 2008:2c8eab01cc78 draft
implement fft using cuda
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 03 Jun 2014 18:10:19 +0900 |
parents | bc2121b09cbc |
children | 6fced32f85fd |
line wrap: on
line source
#include <stdio.h> #include <sys/time.h> #include <string.h> #include <cuda.h> #include <vector_types.h> #include "pgm.h" #define PI 3.14159265358979 #define MAX_SOURCE_SIZE (0x100000) #define AMP(a, b) (sqrt((a)*(a)+(b))) static double st_time; static double ed_time; enum Mode { forward = 0, inverse = 1 }; CUmodule module; static double getTime() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + (double)tv.tv_usec*1e-6; } int setWorkSize(int* block, int* thread, int x, int y) { switch(y) { case 1: *block = x; *thread = 1; break; default: *block = x; *thread = y; break; } return 0; } int fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction) { unsigned int flag; switch (direction) { case forward:flag = 0x00000000; break; case inverse:flag = 0x80000000; break; } int n = 1<<m; int block, thread; setWorkSize(&block, &thread, n, n); CUfunction bitReverse; cuModuleGetFunction(&bitReverse, module, "bitReverse"); void* bitReverse_args[] = {&dst, &src, &m, &n}; cuLaunchKernel(bitReverse, block, 1, 1, thread, 1, 1, 0, NULL, bitReverse_args, NULL); CUfunction butterfly; cuModuleGetFunction(&butterfly, module, "butterfly"); setWorkSize(&block, &thread, n/2, n); void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag}; for (int i=1;i<=m;i++) { butterfly_args[4] = &i; cuLaunchKernel(butterfly, block, 1, 1, thread, 1, 1, 0, NULL, butterfly_args, NULL); } CUfunction norm; cuModuleGetFunction(&norm, module, "norm"); void* norm_args[] = {&dst, &m}; if (direction == inverse) { setWorkSize(&block, &thread, n, n); cuLaunchKernel(norm, block, 1, 1, thread, 1, 1, 0, NULL, norm_args, NULL); } return 0; } char* init(int argc, char**argv){ char *filename = 0; for (int i = 1; argv[i]; ++i) { if (strcmp(argv[i], "-file") == 0) { filename = argv[i+1]; } } if ( (argc == 1)||(filename==0)) { printf("Usage: ./fft -file [image filename] \n"); exit(-1); } return filename; } int main(int args, char* argv[]) { cuInit(0); CUdevice device; cuDeviceGet(&device, 0); CUcontext context; cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); cuModuleLoad(&module, "fft.ptx"); char* pgm_file = init(args, argv); pgm_t ipgm; int err = readPGM(&ipgm, pgm_file); if (err<0) { fprintf(stderr, "Failed to read image file.\n"); exit(1); } int n = ipgm.width; int m = (int)(log((double)n)/log(2.0)); pgm_t opgm; float2* xm = (float2*)malloc(n*n*sizeof(float2)); float2* rm = (float2*)malloc(n*n*sizeof(float2)); float2* wm = (float2*)malloc(n/2*sizeof(float2)); for (int i=0; i<n*n; i++) { xm[i].x = (float)ipgm.buf[i]; xm[i].y = (float)0; } st_time = getTime(); // memory allocate CUdeviceptr xmobj; cuMemAlloc(&xmobj, n*n*sizeof(float2)); CUdeviceptr rmobj; cuMemAlloc(&rmobj, n*n*sizeof(float2)); CUdeviceptr wmobj; cuMemAlloc(&wmobj, (n/2)*sizeof(float2)); // Synchronous data transfer(host to device) cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2)); CUfunction spinFact; cuModuleGetFunction(&spinFact, module, "spinFact"); int block, thread; setWorkSize(&block, &thread, n/2, 1); void* spinFact_args[] = {&xmobj, &n}; cuLaunchKernel(spinFact, block, 1, 1, thread, 1, 1, 0, NULL, spinFact_args, NULL); fftCore(rmobj, xmobj, wmobj, m, forward); CUfunction transpose; cuModuleGetFunction(&transpose, module, "transpose"); setWorkSize(&block, &thread, n, n); void* transpose_args[] = {&xmobj, &rmobj, &n}; cuLaunchKernel(transpose, block, 1, 1, thread, 1, 1, 0, NULL, transpose_args, NULL); fftCore(rmobj, xmobj, wmobj, m, forward); CUfunction highPassFilter; cuModuleGetFunction(&highPassFilter, module, "highPassFilter"); setWorkSize(&block, &thread, n, n); int radius = n/8; void*highPassFilter_args[] = {&rmobj, &n, &radius}; cuLaunchKernel(highPassFilter, block, 1, 1, thread, 1, 1, 0, NULL, highPassFilter_args, NULL); fftCore(xmobj, rmobj, wmobj, m, inverse); setWorkSize(&block, &thread, n, n); void* transpose2_args[] = {&rmobj, &xmobj, &n}; cuLaunchKernel(transpose, block, 1, 1, thread, 1, 1, 0, NULL, transpose2_args, NULL); fftCore(xmobj, rmobj, wmobj, m, inverse); cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2)); float* ampd; ampd = (float*)malloc(n*n*sizeof(float)); for (int i=0;i<n*n;i++) ampd[i] = (AMP(xm[i].x, xm[i].y)); opgm.width = n; opgm.height = n; normalizeF2PGM(&opgm, ampd); free(ampd); ed_time = getTime(); writePGM(&opgm, "output.pgm"); // memory release cuMemFree(xmobj); cuMemFree(rmobj); cuMemFree(wmobj); cuModuleUnload(module); cuCtxDestroy(context); destroyPGM(&ipgm); destroyPGM(&opgm); free(xm); free(rm); free(wm); printf("Time: %0.6f\n", ed_time-st_time); return 0; }