Mercurial > hg > Game > Cerium
changeset 2006:f6aa6d6a3fa2 draft
add fft using cuda, not running
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 03 Jun 2014 12:07:00 +0900 |
parents | 4d1bc8cd3a62 |
children | bc2121b09cbc |
files | example/Cuda/main.cc example/Cuda/multiply.cu example/cuda_fft/Makefile example/cuda_fft/Makefile.def example/cuda_fft/fft.cu example/cuda_fft/main.cc |
diffstat | 6 files changed, 492 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/example/Cuda/main.cc Tue May 27 14:23:44 2014 +0900 +++ b/example/Cuda/main.cc Tue Jun 03 12:07:00 2014 +0900 @@ -3,8 +3,8 @@ #include <string.h> #include <cuda.h> -#define LENGTH 10000 -#define THREAD 100 +#define LENGTH 10 +#define THREAD 10 static double getTime() {
--- a/example/Cuda/multiply.cu Tue May 27 14:23:44 2014 +0900 +++ b/example/Cuda/multiply.cu Tue Jun 03 12:07:00 2014 +0900 @@ -1,5 +1,6 @@ extern "C" { - __global__ void multiply(float* A, float* B, float* C) { + __global__ void multiply(float* A, float* B, float* C,int* i) { + printf("%d %d\n",i[0],i[1]); int index = blockIdx.x * blockDim.x + threadIdx.x; C[index] = A[index] * B[0]; }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/cuda_fft/Makefile Tue Jun 03 12:07:00 2014 +0900 @@ -0,0 +1,37 @@ +include ./Makefile.def + +SRCS_TMP = $(wildcard *.cc) +SRCS_EXCLUDE = # 除外するファイルを書く +SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP)) +OBJS = $(SRCS:.cc=.o) + +CUDA_SRCS_TMP = $(wildcard *.cu) +CUDA_SRCS_EXCLUDE = # 除外するファイルを書く +CUDA_SRCS = $(filter-out $(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP)) +CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx) + +CC += $(ABI) + +LIBS = -F/Library/Frameworks -framework CUDA +INCLUDE = -I$(CUDA_PATH) + +.SUFFIXES: .cc .o .cu .ptx +.cc.o: + $(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@ +.cu.ptx: + $(NVCC) $(NVCCFLAGS) $< + +all: $(TARGET) + +$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS) + $(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS) + +link: + $(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS) + +debug: $(TARGET) + sudo gdb ./$(TARGET) + +clean: + rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS) + rm -f *~ \#*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/cuda_fft/Makefile.def Tue Jun 03 12:07:00 2014 +0900 @@ -0,0 +1,8 @@ +TARGET = multiply + +OPT = -g -O0 + +CC = clang++ +NVCC = nvcc +CFLAGS = -Wall $(OPT) +NVCCFLAGS = -ptx -arch=sm_20 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/cuda_fft/fft.cu Tue Jun 03 12:07:00 2014 +0900 @@ -0,0 +1,198 @@ +extern "C" { + + __global__ void + bitReverse(long* param, float* src, float* dst) + { + unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); + + unsigned int j = gid; + + unsigned long m = param[0]; + unsigned long n = param[1]; + + j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; + j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; + j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; + j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; + j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; + + j >>= (32-m); + + dst[(nid*n+j)*2] = src[(nid*n+gid)*2]; + dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1]; + } +} +extern "C" { + + __global__ void + bitReverse(long* param, float* src, float* dst) + { + unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); + + unsigned int j = gid; + + unsigned long m = param[0]; + unsigned long n = param[1]; + + j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; + j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; + j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; + j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; + j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; + + j >>= (32-m); + + dst[(nid*n+j)*2] = src[(nid*n+gid)*2]; + dst[(nid*n+j)*2+1] = src[(nid*n+gid)*2+1]; + } +} +extern "C" { + __global__ void + butterfly(long* param, float* x_in, float* w, float* x_out) + { + unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); + + long n = param[0]; + long direction_flag = param[1]; + long iter = param[2]; + + int butterflySize = 1 << (iter-1); + int butterflyGrpDist = 1 << iter; + int butterflyGrpNum = n >> iter; + int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist); + int butterflyGrpOffset = gid & (butterflySize-1); + + int a = nid * n + butterflyGrpBase + butterflyGrpOffset; + int b = a + butterflySize; + + int l = butterflyGrpNum * butterflyGrpOffset; + + float xa[2], xb[2], xbxx[2], xbyy[2], wab[2], wayx[2], wbyx[2], resa[2], resb[2]; + + xa[0] = x_in[2*a]; + xa[1] = x_in[2*a+1]; + xb[0] = x_in[2*b]; + xb[1] = x_in[2*b+1]; + xbxx[0] = xbxx[1] = xb[0]; + xbyy[0] = xbyy[1] = xb[1]; + + wab[0] = w[2*l]; + if(direction_flag == 0x80000000) { + wab[1] = -w[2*l+1]; + } else { + wab[1] = w[2*l+1]; + } + + wayx[0] = -wab[1]; + wayx[1] = wab[0]; + + wbyx[0] = wab[1]; + wbyx[1] = -wab[0]; + + resa[0] = xa[0] + xbxx[0]*wab[0] + xbyy[0]*wayx[0]; + resa[1] = xa[1] + xbxx[1]*wab[1] + xbyy[1]*wayx[1]; + + resb[0] = xa[0] - xbxx[0]*wab[0] + xbyy[0]*wbyx[0]; + resb[1] = xa[1] - xbxx[1]*wab[1] + xbyy[1]*wbyx[1]; + + x_out[2*a] = resa[0]; + x_out[2*a+1] = resa[1]; + x_out[2*b] = resb[0]; + x_out[2*b+1] = resb[1]; + } +} +extern "C" { + __global__ void + highPassFilter(long* param, float* in, float* image) + { + unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + unsigned long ygid = blockIdx.y*blockDim.y+threadIdx.y; // (unsigned long)s->get_param(1); + + long n = param[0]; + long radius = param[1]; + + int n_2[2]; + n_2[0] = n_2[1] = n>>1; + + int mask[2]; + mask[0] = mask[1] = n-1; + + int gid[2]; + gid[0] = (xgid + n_2[0]) & mask[0]; + gid[1] = (ygid + n_2[1]) & mask[1]; + + int diff[2]; + diff[0] = n_2[0] - gid[0]; + diff[1] = n_2[1] - gid[1]; + + int diff2[2]; + diff2[0] = diff[0] * diff[0]; + diff2[1] = diff[1] * diff[1]; + + int dist2 = diff2[0] + diff2[1]; + + int window[2]; + + if (dist2 < radius*radius) { + window[0] = window[1] = (int)0L; + } else { + window[0] = window[1] = (int)-1L; + } + + image[(ygid*n+xgid)*2] = (float)((int)in[(ygid*n+xgid)*2] & window[0]); + image[(ygid*n+xgid)*2+1] = (float)((int)in[(ygid*n+xgid)*2+1] & window[1]); + } +} +extern "C" { + __global__ void + norm(long* param, float* in_x,float* out_x) + { + unsigned long gid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + unsigned long nid = blockIdx.y*blockDim.y+threadIdx.y; //(unsigned long)s->get_param(1); + + long n = param[0]; + + out_x[(nid*n+gid)*2] = in_x[(nid*n+gid)*2] / (float)n; + out_x[(nid*n+gid)*2+1] = in_x[(nid*n+gid)*2+1] / (float)n; + } +} +extern "C" { +#include <math.h> + +#define PI 3.14159265358979323846 +#define PI_2 1.57079632679489661923 + + __global__ void + spinFact(long* param, float* w) + { + unsigned long i = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + + long n = param[0]; + + float angle[2]; + angle[0] = (float)(2*i*PI/(float)n); + angle[1] = (float)((2*i*PI/(float)n) + PI_2); + + w[2*i] = cos(angle[0]); + w[2*i+1] = cos(angle[1]); + } +} +extern "C" { + __global__ void + transpose(long* param, float* src, float* dst) + { + unsigned long xgid = blockIdx.x*blockDim.x+threadIdx.x; // (unsigned long)s->get_param(0); + unsigned long ygid = blockIdx.y*blockDim.y*threadIdx.y; // (unsigned long)s->get_param(1); + + long n = param[0]; + + unsigned int iid = ygid * n + xgid; + unsigned int oid = xgid * n + ygid; + + dst[2*oid] = src[2*iid]; + dst[2*oid+1] = src[2*iid+1]; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/cuda_fft/main.cc Tue Jun 03 12:07:00 2014 +0900 @@ -0,0 +1,245 @@ +#include <stdio.h> +#include <sys/time.h> +#include <string.h> +#include <cuda.h> + +#include "pgm.h" + +#define PI 3.14159265358979 +#define MAX_SOURCE_SIZE (0x100000) +#define AMP(a, b) (sqrt((a)*(a)+(b))) + +static double st_time; +static double ed_time; +enum Mode { + forward = 0, + inverse = 1 +}; + +struct int2 { + int x; + int y; +}; + +struct float2 { + float x; + float y; +}; + +CUmodule module; + +static double +getTime() { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec + (double)tv.tv_usec*1e-6; +} + +int +setWorkSize(int* block, int* thread, int x, int y) +{ + switch(y) { + case 1: + block = x; + thread = 1; + break; + default: + block = x; + thread = y; + break; + } + + return 0; +} +int +fftCore(CUdeviceptr dst, CUdeviceptr src, CUdeviceptr spin, int m, enum Mode direction) +{ + + unsigned int flag; + switch (direction) { + case forward:flag = 0x00000000; break; + case inverse:flag = 0x80000000; break; + } + + int n = 1<<m; + int block, thread; + setWorkSize(&block, &thread, n, n); + + CUfunction bitReverse; + cuModuleGetFunction(bitReverse, module, "bitReverse"); + + void* kernel_args[] = {&dst, &src, &m, &n}; + + cuLaunchKernel(bitReverse, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + + CUfunction butterfly; + cuModuleGetFunction(butterfly, module, "butterfly"); + + setWorkSize(&block, &thread, n/2, n); + for (int i=1;i<=m;i++) { + kernel_args[] = {&dst, &spin, &m, &n, &i, &flag}; + cuLaunchKernel(butterfly, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + } + + CUfunction norm; + cuModuleGetFunction(norm, module, "norm"); + + if (direction == inverse) { + setWorkSize(&block, &thread, n, n); + kernel_args[] = {&dst, &m}; + cuLaunchKernel(norm, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + } + + return 0; +} + +char* +init(int argc, char**argv){ + + char *filename = 0; + + for (int i = 1; argv[i]; ++i) { + if (strcmp(argv[i], "-file") == 0) { + filename = argv[i+1]; + } + } + + if ( (argc == 1)||(filename==0)) { + printf("Usage: ./fft -file [image filename] \n"); + exit(-1); + } + + return filename; +} + +int main(int args, char* argv[]) { + // initialize and load kernel + cuInit(0); + + CUdevice device; + cuDeviceGet(&device, 0); + + CUcontext context; + cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device); + + cuModuleLoad(&module, "fft.ptx"); + + char* pgm_file = init(argc, argv); + + pgm_t ipgm; + int err = readPGM(&ipgm, pgm_file); + if (err<0) { + fprintf(stderr, "Failed to read image file.\n"); + exit(1); + } + + int n = ipgm.width; + int m = (int)(log((double)n)/log(2.0)); + + pgm_t opgm; + + float2* xm = (float2*)malloc(n*n*sizeof(float2)); + float2* rm = (float2*)malloc(n*n*sizeof(float2)); + float2* wm = (float2*)malloc(n/2*sizeof(float2)); + + for (int i=0; i<n*n; i++) { + xm[i].x = (float)ipgm.buf[i]; + xm[i].y = (float)0; + } + + st_time = getTime(); + + // memory allocate + CUdeviceptr xmobj; + cuMemAlloc(&xmobj, n*n*sizeof(float2)); + + CUdeviceptr rmobj; + cuMemAlloc(&rmobj, n*n*sizeof(float2)); + + CUdeviceptr wmobj; + cuMemAlloc(&wmobj, (n/2)*sizeof(float2)); + + // Synchronous data transfer(host to device) + cuMemcpyHtoD(xmobj, xm, n*n*sizeof(float2)); + + CUfunction spinFact; + cuModuleGetFunction(spinFact, module, "spinFact"); + + int block, thread; + setWorkSize(&block, &thread, n/2, 1); + + void* kernel_args[] = {&xmobj, &n}; + cuLaunchKernel(spinFact, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + + fftCore(rmobj, xmobj, wmobj, m, forward); + + CUfunction transfer; + cuModuleGetFunction(transfer, module, "transfer"); + + setWorkSize(&block, &thread, n, n); + + kernel_args[] = {&xmobj, &rmobj, &n}; + cuLaunchKernel(transfer, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + + fftCore(rmobj, xmobj, wmobj, m, forward); + + CUfunction highPassFilter; + cuModuleGetFunction(transfer, module, "highPassFilter"); + + setWorkSize(&block, &thread, n, n); + + int radius = n/8; + kernel_args[] = {&rmobj, &n, &radius}; + cuLaunchKernel(highPassFilter, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + + fftCore(xmobj, rmobj, wmobj, m, inverse); + + setWorkSize(&block, &thread, n, n); + + kernel_args[] = {&rmobj, &xmobj}; + cuLaunchKernel(transfer, + block, 1, 1, + thread, 1, 1, + 0, NULL, kernel_args, NULL); + + fftCore(xmobj, rmobj, wmobj, m, inverse); + + + + // memory release + cuMemFree(devA); + for (int i=0;i<num_exec;i++) { + cuMemFree(devB[i]); + cuMemFree(devOut[i]); + } + for (int i=0;i<num_stream;i++) + cuStreamDestroy(stream[i]); + cuModuleUnload(module); + cuCtxDestroy(context); + + delete[] A; + delete[] B; + for (int i=0;i<num_exec;i++) + delete[] result[i]; + delete[] result; + + return 0; +}