Mercurial > hg > Members > yuuhi > OpenCL
changeset 3:f3cfea46e585
add fft_fixstar sample
author | Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 04 Feb 2013 02:59:58 +0900 |
parents | ccea4e6a1945 |
children | 8df0d3128672 |
files | fft_Example/main.cc fft_fixstart/.DS_Store fft_fixstart/Makefile fft_fixstart/Makefile.def fft_fixstart/fft.cl fft_fixstart/main.cc fft_fixstart/pgm.h fft_fixstart/sample.jpg fft_fixstart/sample.pgm |
diffstat | 9 files changed, 673 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/fft_Example/main.cc Tue Jan 22 23:19:41 2013 +0900 +++ b/fft_Example/main.cc Mon Feb 04 02:59:58 2013 +0900 @@ -670,7 +670,7 @@ return CL_DEVICE_TYPE_DEFAULT; } // default - return CL_DEVICE_TYPE_GPU; + return CL_DEVICE_TYPE_CPU; } void
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft_fixstart/Makefile Mon Feb 04 02:59:58 2013 +0900 @@ -0,0 +1,33 @@ +include ./Makefile.def +SRCS_TMP = $(wildcard *.cc) +SRCS_EXCLUDE = # 除外するファイルを書く +SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP)) +OBJS = $(SRCS:.cc=.o) + +TASK_SRCS_TMP = $(wildcard $(TASK_DIR2)/*.cc $(TASK_DIR1)/*.cc) +TASK_SRCS = $(filter-out $(TASK_DIR1)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP)) +TASK_OBJS = $(TASK_SRCS:.cc=.o) + +CC += $(ABI) + +LIBS = -framework opencl + +.SUFFIXES: .cc .o + +.cc.o: + $(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@ + +all: $(TARGET) + +$(TARGET): $(OBJS) $(TASK_OBJS) + $(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS) + +link: + $(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS) + +debug: $(TARGET) + sudo gdb ./$(TARGET) + +clean: + rm -f $(TARGET) $(OBJS) $(TASK_OBJS) + rm -f *~ \#*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft_fixstart/Makefile.def Mon Feb 04 02:59:58 2013 +0900 @@ -0,0 +1,13 @@ +TARGET = fft + +# include/library path +# ex macosx +#CERIUM = /Users/gongo/Source/Cerium +ABIBIT=64 + +# ex linux/ps3 + +OPT = -g + +CC = clang++ +CFLAGS = -Wall $(OPT)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft_fixstart/fft.cl Mon Feb 04 02:59:58 2013 +0900 @@ -0,0 +1,104 @@ +#define PI 3.14159265358979323846 +#define PI_2 1.57079632679489661923 + +__kernel void spinFact(__global float2* w, int n) +{ + unsigned int i = get_global_id(0); + + float2 angle = (float2)(2*i*PI/(float)n,(2*i*PI/(float)n)+PI_2); + w[i] = cos(angle); +} + +__kernel void bitReverse(__global float2 *dst, __global float2 *src, int m, int n) +{ + unsigned int gid = get_global_id(0); + unsigned int nid = get_global_id(1); + + unsigned int j = gid; + j = (j & 0x55555555) << 1 | (j & 0xAAAAAAAA) >> 1; + j = (j & 0x33333333) << 2 | (j & 0xCCCCCCCC) >> 2; + j = (j & 0x0F0F0F0F) << 4 | (j & 0xF0F0F0F0) >> 4; + j = (j & 0x00FF00FF) << 8 | (j & 0xFF00FF00) >> 8; + j = (j & 0x0000FFFF) << 16 | (j & 0xFFFF0000) >> 16; + + j >>= (32-m); + + dst[nid*n+j] = src[nid*n+gid]; +} + +__kernel void norm(__global float2 *x, int n) +{ + unsigned int gid = get_global_id(0); + unsigned int nid = get_global_id(1); + + x[nid*n+gid] = x[nid*n+gid] / (float2)((float)n, (float)n); +} + +__kernel void butterfly(__global float2 *x, __global float2* w, int m, int n, int iter, uint flag) +{ + unsigned int gid = get_global_id(0); + unsigned int nid = get_global_id(1); + + int butterflySize = 1 << (iter-1); + int butterflyGrpDist = 1 << iter; + int butterflyGrpNum = n >> iter; + int butterflyGrpBase = (gid >> (iter-1))*(butterflyGrpDist); + int butterflyGrpOffset = gid & (butterflySize-1); + + int a = nid * n + butterflyGrpBase + butterflyGrpOffset; + int b = a + butterflySize; + + int l = butterflyGrpNum * butterflyGrpOffset; + + float2 xa, xb, xbxx, xbyy, wab, wayx, wbyx, resa, resb; + + xa = x[a]; + xb = x[b]; + xbxx = xb.xx; + xbyy = xb.yy; + + wab = as_float2(as_uint2(w[l]) ^ (uint2)(0x0, flag)); + wayx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x80000000, 0x0)); + wbyx = as_float2(as_uint2(wab.yx) ^ (uint2)(0x0, 0x80000000)); + + resa = xa + xbxx*wab + xbyy*wayx; + resb = xa - xbxx*wab + xbyy*wbyx; + + x[a] = resa; + x[b] = resb; +} + +__kernel void transpose(__global float2 *dst, __global float2* src, int n) +{ + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); + + unsigned int iid = ygid * n + xgid; + unsigned int oid = xgid * n + ygid; + + dst[oid] = src[iid]; +} + +__kernel void highPassFilter(__global float2* image, int n, int radius) +{ + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); + + int2 n_2 = (int2)(n>>1, n>>1); + int2 mask = (int2)(n-1, n-1); + + int2 gid = ((int2)(xgid, ygid) + n_2) & mask; + + int2 diff = n_2 - gid; + int2 diff2 = diff * diff; + int dist2 = diff2.x + diff2.y; + + int2 window; + + if (dist2 < radius*radius) { + window = (int2)(0L, 0L); + } else { + window = (int2)(-1L, -1L); + } + image[ygid*n+xgid] = as_float2(as_int2(image[ygid*n+xgid]) & window); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft_fixstart/main.cc Mon Feb 04 02:59:58 2013 +0900 @@ -0,0 +1,335 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <sys/stat.h> +#include <fcntl.h> + +#ifdef __APPLE__ +#include <OpenCL/opencl.h> +#else +#include <CL/cl.h> +#endif + +#include "pgm.h" + +#define PI 3.14159265358979 + +#define MAX_SOURCE_SIZE (0x100000) + +#define AMP(a, b) (sqrt((a)*(a)+(b)*(b))) + +cl_device_id device_id = NULL; +cl_context context = NULL; +cl_command_queue queue = NULL; +cl_program program = NULL; +cl_device_type device_type = CL_DEVICE_TYPE_GPU; + +enum Mode { + forward = 0, + inverse = 1 +}; + +int setWorkSize(size_t* gws, size_t* lws, cl_int x, cl_int y) +{ + switch(y) { + case 1: + gws[0] = x; + gws[1] = 1; + lws[0] = 1; + lws[1] = 1; + break; + default: + gws[0] = x; + gws[1] = y; + lws[0] = 1; + lws[1] = 1; + break; + } + + return 0; +} + +int fftCore(cl_mem dst, cl_mem src, cl_mem spin, cl_int m, enum Mode direction) +{ + cl_int ret; + + cl_int iter; + cl_uint flag; + + cl_int n = 1<<m; + + cl_event kernelDone; + + cl_kernel brev = NULL; + cl_kernel bfly = NULL; + cl_kernel norm = NULL; + + brev = clCreateKernel(program, "bitReverse", &ret); + bfly = clCreateKernel(program, "butterfly", &ret); + norm = clCreateKernel(program, "norm", &ret); + + size_t gws[2]; + size_t lws[2]; + + switch (direction) { + case forward:flag = 0x00000000; break; + case inverse:flag = 0x80000000; break; + } + + ret = clSetKernelArg(brev, 0, sizeof(cl_mem), (void *)&dst); + ret = clSetKernelArg(brev, 1, sizeof(cl_mem), (void *)&src); + ret = clSetKernelArg(brev, 2, sizeof(cl_int), (void *)&m); + ret = clSetKernelArg(brev, 3, sizeof(cl_int), (void *)&n); + + ret = clSetKernelArg(bfly, 0, sizeof(cl_mem), (void *)&dst); + ret = clSetKernelArg(bfly, 1, sizeof(cl_mem), (void *)&spin); + ret = clSetKernelArg(bfly, 2, sizeof(cl_int), (void *)&m); + ret = clSetKernelArg(bfly, 3, sizeof(cl_int), (void *)&n); + ret = clSetKernelArg(bfly, 5, sizeof(cl_uint), (void *)&flag); + + ret = clSetKernelArg(norm, 0, sizeof(cl_mem), (void *)&dst); + ret = clSetKernelArg(norm, 1, sizeof(cl_int), (void *)&n); + + /* Reverse bit ordering */ + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, brev, 2, NULL, gws, lws, 0, NULL, NULL); + + /* Perform Butterfly Operations*/ + setWorkSize(gws, lws, n/2, n); + for (iter=1; iter <= m; iter++) { + ret = clSetKernelArg(bfly, 4, sizeof(cl_int), (void *)&iter); + ret = clEnqueueNDRangeKernel(queue, bfly, 2, NULL, gws, lws, 0, NULL, &kernelDone); + ret = clWaitForEvents(1, &kernelDone); + } + + if (direction == inverse) { + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, norm, 2, NULL, gws, lws, 0, NULL, &kernelDone); + ret = clWaitForEvents(1, &kernelDone); + } + + ret = clReleaseKernel(bfly); + ret = clReleaseKernel(brev); + ret = clReleaseKernel(norm); + + return 0; +} + +char * +init(int argc, char**argv){ + + char *filename = 0; + + for (int i = 1; argv[i]; ++i) { + if (strcmp(argv[i], "-file") == 0) { + filename = argv[i+1]; + } else if (strcmp(argv[i], "-cpu") == 0) { + device_type = CL_DEVICE_TYPE_CPU; + } else if (strcmp(argv[i], "-gpu") == 0) { + device_type = CL_DEVICE_TYPE_GPU; + } + } + if ( (argc == 1)||(filename==0)) { + printf("Usage: ./fft [image filename]\n"); + exit(-1); + } + + return filename; +} + +int main(int argc, char** argv) { + cl_mem xmobj = NULL; + cl_mem rmobj = NULL; + cl_mem wmobj = NULL; + cl_kernel sfac = NULL; + cl_kernel trns = NULL; + cl_kernel hpfl = NULL; + + cl_platform_id platform_id = NULL; + + cl_uint ret_num_devices; + cl_uint ret_num_platforms; + + cl_int ret; + + cl_float2 *xm; + cl_float2 *rm; + cl_float2 *wm; + + pgm_t ipgm; + pgm_t opgm; + + const char fileName[] = "./fft.cl"; + size_t source_size; + char *source_str; + cl_int i, j; + cl_int n; + cl_int m; + + size_t gws[2]; + size_t lws[2]; + + /* Load kernel source code */ + int fd = open(fileName, O_RDONLY); + + if (fd<0) { + fprintf(stderr, "Failed to load kernel %s.\n",fileName); + exit(1); + } + struct stat stats; + fstat(fd, &stats); + off_t size = stats.st_size; + if (size<=0) { + fprintf(stderr, "Failed to load kernel.\n"); + exit(1); + } + source_str = (char*)alloca(size); + source_size = read(fd, source_str, size); + close( fd ); + + char * pgm_file = init(argc,argv); + + /* Read image */ + int err = readPGM(&ipgm, pgm_file); + if (err<0) { + fprintf(stderr, "Failed to read image file.\n"); + exit(1); + } + + n = ipgm.width; + m = (cl_int)(log((double)n)/log(2.0)); + + xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); + rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); + wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); + + for (i=0; i < n; i++) { + for (j=0; j < n; j++) { + ((float*)xm)[(2*n*j)+2*i+0] = (float)ipgm.buf[n*j+i]; + ((float*)xm)[(2*n*j)+2*i+1] = (float)0; + } + } + + /* Get platform/device */ + ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); + + /* Create OpenCL context */ + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + + /* Create Command queue */ + queue = clCreateCommandQueue(context, device_id, 0, &ret); + + /* Create Buffer Objects */ + xmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); + rmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret); + wmobj = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret); + + /* Transfer data to memory buffer */ + ret = clEnqueueWriteBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); + + /* Create kernel program from source */ + program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + + /* Build kernel program */ + ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + + if (ret<0) { + size_t size; + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); + + char *log = new char[size]; + clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); + printf("%s ",log); + exit (ret); + } + + /* Create OpenCL Kernel */ + sfac = clCreateKernel(program, "spinFact", &ret); + trns = clCreateKernel(program, "transpose", &ret); + hpfl = clCreateKernel(program, "highPassFilter", &ret); + + /* Create spin factor */ + ret = clSetKernelArg(sfac, 0, sizeof(cl_mem), (void *)&wmobj); + ret = clSetKernelArg(sfac, 1, sizeof(cl_int), (void *)&n); + setWorkSize(gws, lws, n/2, 1); + ret = clEnqueueNDRangeKernel(queue, sfac, 1, NULL, gws, lws, 0, NULL, NULL); + + /* Butterfly Operation */ + fftCore(rmobj, xmobj, wmobj, m, forward); + + /* Transpose matrix */ + ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&xmobj); + ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&rmobj); + ret = clSetKernelArg(trns, 2, sizeof(cl_int), (void *)&n); + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); + + /* Butterfly Operation */ + fftCore(rmobj, xmobj, wmobj, m, forward); + + /* Apply high-pass filter */ + cl_int radius = n/8; + ret = clSetKernelArg(hpfl, 0, sizeof(cl_mem), (void *)&rmobj); + ret = clSetKernelArg(hpfl, 1, sizeof(cl_int), (void *)&n); + ret = clSetKernelArg(hpfl, 2, sizeof(cl_int), (void *)&radius); + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, hpfl, 2, NULL, gws, lws, 0, NULL, NULL); + + /* Inverse FFT */ + + /* Butterfly Operation */ + fftCore(xmobj, rmobj, wmobj, m, inverse); + + /* Transpose matrix */ + ret = clSetKernelArg(trns, 0, sizeof(cl_mem), (void *)&rmobj); + ret = clSetKernelArg(trns, 1, sizeof(cl_mem), (void *)&xmobj); + setWorkSize(gws, lws, n, n); + ret = clEnqueueNDRangeKernel(queue, trns, 2, NULL, gws, lws, 0, NULL, NULL); + + /* Butterfly Operation */ + + fftCore(xmobj, rmobj, wmobj, m, inverse); + + /* Read data from memory buffer */ + ret = clEnqueueReadBuffer(queue, xmobj, CL_TRUE, 0, n*n*sizeof(cl_float2), xm, 0, NULL, NULL); + + /* */ + float* ampd; + ampd = (float*)malloc(n*n*sizeof(float)); + for (i=0; i < n; i++) { + for (j=0; j < n; j++) { + ampd[n*((i))+((j))] = (AMP(((float*)xm)[(2*n*i)+2*j], ((float*)xm)[(2*n*i)+2*j+1])); + } + } + opgm.width = n; + opgm.height = n; + normalizeF2PGM(&opgm, ampd); + free(ampd); + + /* Write out image */ + writePGM(&opgm, "output.pgm"); + + /* Finalizations*/ + ret = clFlush(queue); + ret = clFinish(queue); + ret = clReleaseKernel(hpfl); + ret = clReleaseKernel(trns); + ret = clReleaseKernel(sfac); + ret = clReleaseProgram(program); + ret = clReleaseMemObject(xmobj); + ret = clReleaseMemObject(rmobj); + ret = clReleaseMemObject(wmobj); + ret = clReleaseCommandQueue(queue); + ret = clReleaseContext(context); + + destroyPGM(&ipgm); + destroyPGM(&opgm); + + free(wm); + free(rm); + free(xm); + + fprintf(stdout, "image out put succeeded.\n"); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fft_fixstart/pgm.h Mon Feb 04 02:59:58 2013 +0900 @@ -0,0 +1,187 @@ +#ifndef _PGM_H_ +#define _PGM_H_ + +#include <math.h> +#include <string.h> + +#define PGM_MAGIC "P5" + +#ifdef _WIN32 +#define STRTOK_R(ptr, del, saveptr) strtok_s(ptr, del, saveptr) +#else +#define STRTOK_R(ptr, del, saveptr) strtok_r(ptr, del, saveptr) +#endif + +typedef struct _pgm_t { + int width; + int height; + unsigned char *buf; +} pgm_t; + +int readPGM(pgm_t* pgm, const char* filename) +{ + char *token, *pc, *saveptr; + char *buf; + size_t bufsize; + char del[] = " \t\n"; + unsigned char *dot; + + long begin, end; + int filesize; + int i, w, h, luma, pixs; + + + FILE* fp; + if ((fp = fopen(filename, "rb"))==NULL) { + fprintf(stderr, "Failed to open file\n"); + return -1; + } + + fseek(fp, 0, SEEK_SET); + begin = ftell(fp); + fseek(fp, 0, SEEK_END); + end = ftell(fp); + filesize = (int)(end - begin); + + buf = (char*)malloc(filesize * sizeof(char)); + fseek(fp, 0, SEEK_SET); + bufsize = fread(buf, filesize * sizeof(char), 1, fp); + + fclose(fp); + + token = (char *)STRTOK_R(buf, del, &saveptr); + if (strncmp(token, PGM_MAGIC, 2) != 0) { + return -1; + } + + token = (char *)STRTOK_R(NULL, del, &saveptr); + if (token[0] == '#' ) { + token = (char *)STRTOK_R(NULL, "\n", &saveptr); + token = (char *)STRTOK_R(NULL, del, &saveptr); + } + + w = strtoul(token, &pc, 10); + token = (char *)STRTOK_R(NULL, del, &saveptr); + h = strtoul(token, &pc, 10); + token = (char *)STRTOK_R(NULL, del, &saveptr); + luma = strtoul(token, &pc, 10); + + token = pc + 1; + pixs = w * h; + + pgm->buf = (unsigned char *)malloc(pixs * sizeof(unsigned char)); + + dot = pgm->buf; + + for (i=0; i< pixs; i++, dot++) { + *dot = *token++; + } + + pgm->width = w; + pgm->height = h; + + return 0; +} + +int writePGM(pgm_t* pgm, const char* filename) +{ + int i, w, h, pixs; + FILE* fp; + unsigned char* dot; + + w = pgm->width; + h = pgm->height; + pixs = w * h; + + if ((fp = fopen(filename, "wb+")) ==NULL) { + fprintf(stderr, "Failed to open file\n"); + return -1; + } + + fprintf (fp, "%s\n%d %d\n255\n", PGM_MAGIC, w, h); + + dot = pgm->buf; + + for (i=0; i<pixs; i++, dot++) { + putc((unsigned char)*dot, fp); + } + + fclose(fp); + + return 0; +} + +int normalizeD2PGM(pgm_t* pgm, double* x) +{ + int i, j, w, h; + + w = pgm->width; + h = pgm->height; + + pgm->buf = (unsigned char*)malloc(w * h * sizeof(unsigned char)); + + double min = 0; + double max = 0; + for (i=0; i < h; i++) { + for (j=0; j < w; j++) { + if (max < x[i*w+j]) + max = x[i*w+j]; + if (min > x[i*w+j]) + min = x[i*w+j]; + } + } + + for (i=0; i < h; i++) { + for (j=0; j < w; j++) { + if((max-min)!=0) + pgm->buf[i*w+j] = (unsigned char)(255*(x[i*w+j]-min)/(max-min)); + else + pgm->buf[i*w+j]= 0; + } + } + + return 0; +} + +int normalizeF2PGM(pgm_t* pgm, float* x) +{ + int i, j, w, h; + + w = pgm->width; + h = pgm->height; + + pgm->buf = (unsigned char*)malloc(w * h * sizeof(unsigned char)); + + float min = 0; + float max = 0; + for (i=0; i < h; i++) { + for (j=0; j < w; j++) { + if (max < x[i*w+j]) + max = x[i*w+j]; + if (min > x[i*w+j]) + min = x[i*w+j]; + } + } + + for (i=0; i < h; i++) { + for (j=0; j < w; j++) { + if((max-min)!=0) + pgm->buf[i*w+j] = (unsigned char)(255*(x[i*w+j]-min)/(max-min)); + else + pgm->buf[i*w+j]= 0; + } + } + + return 0; +} + +int destroyPGM(pgm_t* pgm) +{ + if (pgm->buf) { + free(pgm->buf); + } + + return 0; +} + +#endif /* _PGM_H_ */