Game/Cerium: example/cuda_fft/main.cc comparison

comparison example/cuda_fft/main.cc @ 2008:2c8eab01cc78 draft

Implement FFT using CUDA

author	Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date	Tue, 03 Jun 2014 18:10:19 +0900
parents	bc2121b09cbc
children	6fced32f85fd

comparison

Legend: equal / deleted / inserted / replaced

-:bc2121b09cbc
+:2c8eab01cc78
 #include <stdio.h>
 #include <sys/time.h>
 #include <string.h>
 #include <cuda.h>
+#include <vector_types.h>
 #include "pgm.h"
 #define PI 3.14159265358979
 #define MAX_SOURCE_SIZE (0x100000)
 enum Mode {
 forward = 0,
 inverse = 1
 };
-struct int2 {
-int x;
-int y;
-};
-struct float2 {
-float x;
-float y;
-};
 CUmodule module;
 static double
 getTime() {
 struct timeval tv;
 int
 setWorkSize(int* block, int* thread, int x, int y)
 {
 switch(y) {
 case 1:
-block = x;
+*block = x;
-thread = 1;
+*thread = 1;
 break;
 default:
-block = x;
+*block = x;
-thread = y;
+*thread = y;
 break;
 }
 return 0;
 }
 int n = 1<<m;
 int block, thread;
 setWorkSize(&block, &thread, n, n);
 CUfunction bitReverse;
-cuModuleGetFunction(bitReverse, module, "bitReverse");
+cuModuleGetFunction(&bitReverse, module, "bitReverse");
-void* kernel_args[] = {&dst, &src, &m, &n};
+void* bitReverse_args[] = {&dst, &src, &m, &n};
 cuLaunchKernel(bitReverse,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, bitReverse_args, NULL);
 CUfunction butterfly;
-cuModuleGetFunction(butterfly, module, "butterfly");
+cuModuleGetFunction(&butterfly, module, "butterfly");
 setWorkSize(&block, &thread, n/2, n);
+void* butterfly_args[] = {&dst, &spin, &m, &n, 0, &flag};
 for (int i=1;i<=m;i++) {
-kernel_args[] = {&dst, &spin, &m, &n, &i, &flag};
+butterfly_args[4] = &i;
 cuLaunchKernel(butterfly,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, butterfly_args, NULL);
 }
 CUfunction norm;
-cuModuleGetFunction(norm, module, "norm");
+cuModuleGetFunction(&norm, module, "norm");
+void* norm_args[] = {&dst, &m};
 if (direction == inverse) {
 setWorkSize(&block, &thread, n, n);
-kernel_args[] = {&dst, &m};
 cuLaunchKernel(norm,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, norm_args, NULL);
 }
 return 0;
 }
 CUcontext context;
 cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
 cuModuleLoad(&module, "fft.ptx");
-char* pgm_file = init(argc, argv);
+char* pgm_file = init(args, argv);
 pgm_t ipgm;
 int err = readPGM(&ipgm, pgm_file);
 if (err<0) {
 fprintf(stderr, "Failed to read image file.\n");
 // Synchronous data transfer(host to device)
 cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2));
 CUfunction spinFact;
-cuModuleGetFunction(spinFact, module, "spinFact");
+cuModuleGetFunction(&spinFact, module, "spinFact");
 int block, thread;
 setWorkSize(&block, &thread, n/2, 1);
-void* kernel_args[] = {&xmobj, &n};
+void* spinFact_args[] = {&xmobj, &n};
 cuLaunchKernel(spinFact,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, spinFact_args, NULL);
 fftCore(rmobj, xmobj, wmobj, m, forward);
-CUfunction transfer;
+CUfunction transpose;
-cuModuleGetFunction(transfer, module, "transfer");
+cuModuleGetFunction(&transpose, module, "transpose");
 setWorkSize(&block, &thread, n, n);
-kernel_args[] = {&xmobj, &rmobj, &n};
+void* transpose_args[] = {&xmobj, &rmobj, &n};
-cuLaunchKernel(transfer,
+cuLaunchKernel(transpose,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, transpose_args, NULL);
 fftCore(rmobj, xmobj, wmobj, m, forward);
 CUfunction highPassFilter;
-cuModuleGetFunction(transfer, module, "highPassFilter");
+cuModuleGetFunction(&highPassFilter, module, "highPassFilter");
 setWorkSize(&block, &thread, n, n);
 int radius = n/8;
-kernel_args[] = {&rmobj, &n, &radius};
+void*highPassFilter_args[] = {&rmobj, &n, &radius};
 cuLaunchKernel(highPassFilter,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, highPassFilter_args, NULL);
 fftCore(xmobj, rmobj, wmobj, m, inverse);
 setWorkSize(&block, &thread, n, n);
-kernel_args[] = {&rmobj, &xmobj};
+void* transpose2_args[] = {&rmobj, &xmobj, &n};
-cuLaunchKernel(transfer,
+cuLaunchKernel(transpose,
 block, 1, 1,
 thread, 1, 1,
-0, NULL, kernel_args, NULL);
+0, NULL, transpose2_args, NULL);
 fftCore(xmobj, rmobj, wmobj, m, inverse);
+cuMemcpyDtoH(xm, xmobj, n*n*sizeof(float2));
+float* ampd;
+ampd = (float*)malloc(n*n*sizeof(float));
+for (int i=0;i<n*n;i++)
+ampd[i] = (AMP(xm[i].x, xm[i].y));
+opgm.width = n;
+opgm.height = n;
+normalizeF2PGM(&opgm, ampd);
+free(ampd);
+ed_time = getTime();
+writePGM(&opgm, "output.pgm");
 // memory release
-cuMemFree(devA);
+cuMemFree(xmobj);
-for (int i=0;i<num_exec;i++) {
+cuMemFree(rmobj);
-cuMemFree(devB[i]);
+cuMemFree(wmobj);
-cuMemFree(devOut[i]);
-}
-for (int i=0;i<num_stream;i++)
-cuStreamDestroy(stream[i]);
 cuModuleUnload(module);
 cuCtxDestroy(context);
-delete[] A;
+destroyPGM(&ipgm);
-delete[] B;
+destroyPGM(&opgm);
-for (int i=0;i<num_exec;i++)
-delete[] result[i];
+free(xm);
-delete[] result;
+free(rm);
+free(wm);
+printf("Time: %0.6f\n", ed_time-st_time);
 return 0;
 }

Mercurial > hg > Game > Cerium

comparison example/cuda_fft/main.cc @ 2008:2c8eab01cc78 draft