Mercurial > hg > Game > Cerium
changeset 1941:f19885ea776d draft
add wordcount for cuda. fix CudaScheduler. add makefile
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 04 Feb 2014 02:18:07 +0900 |
parents | e8ca9cae59fc |
children | b5d778f00bf1 |
files | TaskManager/Cuda/CudaScheduler.cc example/multiply/Func.h example/multiply/cuda/multiply.cu example/multiply/gpu/Multi.cl example/multiply/main.cc example/word_count/Makefile example/word_count/Makefile.cuda example/word_count/cuda/Exec.cu example/word_count/cuda/Exec_Data_Parallel.cu example/word_count/main.cc example/word_count/task_init.cc |
diffstat | 11 files changed, 158 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc Sun Feb 02 18:34:31 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Tue Feb 04 02:18:07 2014 +0900 @@ -93,7 +93,7 @@ if (mem[cur-1].event[i] != 0) cuEventDestroy(mem[cur-1].event[i]); mem[cur-1].event[i] = 0; - if (mem[1-cur].memout[i]) + if (mem[cur-1].memout[i]) cuMemFree(mem[cur-1].memout[i]); mem[cur-1].memout[i] = 0; } @@ -107,7 +107,7 @@ } else if (kernel_event[cur-1] != NULL){ ret = cuEventSynchronize(kernel_event[cur-1]); - + if (ret!=0) { error(convert_error_status(ret)); } @@ -299,16 +299,19 @@ } else { wait_for_event(kernel_event, cudabuffer, tasklist, cur); } - cur += 1; + cur++; if (STAGE <= cur) cur = 0; free(kernelParams); - cuModuleUnload(module); } reply = (memaddr)tasklist->waiter; param_addr = (memaddr)tasklist->next; } - wait_for_event(kernel_event, cudabuffer, tasklist, cur); + if (cur == 0) { + wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); + } else { + wait_for_event(kernel_event, cudabuffer, tasklist, cur); + } for (int i = 0; i<STAGE; i++) { ret = cuStreamSynchronize(cudabuffer[i].stream); if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
--- a/example/multiply/Func.h Sun Feb 02 18:34:31 2014 +0900 +++ b/example/multiply/Func.h Tue Feb 04 02:18:07 2014 +0900 @@ -3,4 +3,4 @@ MULTIPLY_TASK, }; -#define DATA_NUM 1000 +#define DATA_NUM 60000
--- a/example/multiply/cuda/multiply.cu Sun Feb 02 18:34:31 2014 +0900 +++ b/example/multiply/cuda/multiply.cu Tue Feb 04 02:18:07 2014 +0900 @@ -1,6 +1,8 @@ extern "C" { - __global__ void multi(void* params, float* A, float* B, float* C) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - C[id]=A[id]*B[id]; + __global__ void multi(long* params, float* A, float* B, float* C) { + //int id = blockIdx.x * blockDim.x + threadIdx.x; + long length = params[0]; + for (int id = 0; id < length; id++) + C[id]=A[id]*B[id]; } }
--- a/example/multiply/gpu/Multi.cl Sun Feb 02 18:34:31 2014 +0900 +++ b/example/multiply/gpu/Multi.cl Tue Feb 04 02:18:07 2014 +0900 @@ -2,9 +2,7 @@ multi(__global const long *params, __global const float* A, __global const float* B, __global float* C) { // int i=get_global_id(0); - long length = (long)params[0]; long id = get_global_id(0); // for(int i=0;i<length;i++) { - if(length) - C[id]=A[id]*B[id]; + C[id]=A[id]*B[id]; }
--- a/example/multiply/main.cc Sun Feb 02 18:34:31 2014 +0900 +++ b/example/multiply/main.cc Tue Feb 04 02:18:07 2014 +0900 @@ -105,7 +105,7 @@ multiply->set_outData(0,(memaddr)C, sizeof(float)*length); multiply->set_param(0,(long)length); // param 0に0~length-1をsetしたtaskをlength個spawnする - multiply->iterate(length); + //multiply->iterate(length); // hoge = manager->create_task(MULTIPLY_TASK); // hoge->set_cpu(spe_cpu); @@ -116,7 +116,7 @@ // hoge->set_param(0,(long)0); // hoge->wait_for(multiply); // hoge->iterate(length); - // multiply->spawn(); + multiply->spawn(); // } }
--- a/example/word_count/Makefile Sun Feb 02 18:34:31 2014 +0900 +++ b/example/word_count/Makefile Tue Feb 04 02:18:07 2014 +0900 @@ -23,9 +23,14 @@ @echo "Make for OpenCL" @$(MAKE) -f Makefile.gpu test +cuda: FORCE + @echo "Make for Cuda" + @$(MAKE) -f Makefile.cuda + FORCE: clean: @$(MAKE) -f Makefile.macosx clean @$(MAKE) -f Makefile.linux clean @$(MAKE) -f Makefile.cell clean + @$(MAKE) -f Makefile.cuda clean
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/word_count/Makefile.cuda Tue Feb 04 02:18:07 2014 +0900 @@ -0,0 +1,52 @@ +include ./Makefile.def + +SRCS_TMP = $(wildcard *.cc) +SRCS_EXCLUDE = # 除外するファイルを書く +SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP)) +OBJS = $(SRCS:.cc=.o) + +TASK_DIR = ppe +CUDA_TASK_DIR = cuda + +TASK_SRCS_TMP = $(wildcard $(TASK_DIR)/*.cc) +TASK_SRCS_EXCLUDE = +TASK_SRCS = $(filter-out $(TASK_DIR)/$(TASK_SRCS_EXCLUDE),$(TASK_SRCS_TMP)) $(wildcard $(CUDA_TASK_DIR)/*.cc) +TASK_OBJS = $(TASK_SRCS:.cc=.o) + +CUDA_SRCS_TMP = $(wildcard $(CUDA_TASK_DIR)/*.cu) +CUDA_SRCS_EXCLUDE = # 除外するファイルを書く +CUDA_SRCS = $(filter-out $(CUDA_TASK_DIR)/$(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP)) +CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx) + +CFLAGS += -D__CERIUM_CUDA__ +LIBS += `sdl-config --libs` -lCudaManager -F/Library/Frameworks -framework CUDA + +INCLUDE += -I/Developer/NVIDIA/CUDA-5.5/include + +NVCC = /Developer/NVIDIA/CUDA-5.5/bin/nvcc +NVCCFLAGS = -ptx -arch=sm_20 +INDEX = 0 + +.SUFFIXES: .cc .o .cu .ptx + +.cc.o: + $(CC) $(CFLAGS) $(INCLUDE) -c $< -o $@ + +.cu.ptx: + $(NVCC) $(NVCCFLAGS) $< -o $@ + +all: $(TARGET) $(CUDA_OBJS) + +$(TARGET): $(OBJS) $(TASK_OBJS) $(CUDA_OBJS) + $(CC) -o $@ $(OBJS) $(TASK_OBJS) $(LIBS) + +link: + $(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS) + +debug: $(TARGET) + sudo ppu-gdb ./$(TARGET) + +clean: + rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS) + rm -f *~ \#* + rm -f cuda/*~ cuda/\#*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/word_count/cuda/Exec.cu Tue Feb 04 02:18:07 2014 +0900 @@ -0,0 +1,34 @@ +extern "C" { + __global__ void + wordcount(long *param, + char *i_data, + unsigned long *o_data) + { + unsigned long *head_tail_flag = o_data+2; + long length = param[0]; + long word_flag = 0; + long word_num = 0; + long line_num = 0; + long i = 0; + + head_tail_flag[0] = (i_data[0] != 0x20) && (i_data[0] != 0x0A); + word_num -= 1-head_tail_flag[0]; + for (; i < length; i++) { + if (i_data[i] == 0x20) { + word_flag = 1; + } else if (i_data[i] == 0x0A) { + line_num += 1; + word_flag = 1; + } else { + word_num += word_flag; + word_flag = 0; + } + } + + word_num += word_flag; + head_tail_flag[1] = (i_data[i-1] != 0x20) && (i_data[i-1] != 0x0A); + + o_data[0] = (unsigned long)word_num; + o_data[1] = (unsigned long)line_num; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/example/word_count/cuda/Exec_Data_Parallel.cu Tue Feb 04 02:18:07 2014 +0900 @@ -0,0 +1,41 @@ +extern "C" { + __global__ void + wordcount_parallel(long *param, + char *rbuf, + unsigned long *wbuf) + { + long task_spwaned = param[0]; + long division_size = param[1]; + long length = param[2]; + long out_size = param[3]; + int allocation = (int)task_spwaned + (blockIdx.x * blockDim.x + threadIdx.x); + char *i_data = rbuf + allocation*division_size; + unsigned long *o_data = wbuf + allocation*out_size; + unsigned long *head_tail_flag = o_data+2; + long word_flag = 0; + long word_num = 0; + long line_num = 0; + long i = 0; + + head_tail_flag[0] = (i_data[0] != 0x20) && (i_data[0] != 0x0A); + word_num -= 1-head_tail_flag[0]; + + for (; i < length; i++) { + if (i_data[i] == 0x20) { + word_flag = 1; + } else if (i_data[i] == 0x0A) { + line_num += 1; + word_flag = 1; + } else { + word_num += word_flag; + word_flag = 0; + } + } + + word_num += word_flag; + head_tail_flag[1] = (i_data[i-1] != 0x20) && (i_data[i-1] != 0x0A); + + o_data[0] = (unsigned long)word_num; + o_data[1] = (unsigned long)line_num; + } +}
--- a/example/word_count/main.cc Sun Feb 02 18:34:31 2014 +0900 +++ b/example/word_count/main.cc Tue Feb 04 02:18:07 2014 +0900 @@ -339,7 +339,7 @@ array_task_num = atoi(argv[i+1]); i++; } else if (strcmp(argv[i], "-g") == 0) { - spe_cpu = GPU_ANY; + spe_cpu = GPU_0; } else if (strcmp(argv[i], "-any") == 0) { spe_cpu = ANY_ANY; } else if (strcmp(argv[i], "-i") == 0) {
--- a/example/word_count/task_init.cc Sun Feb 02 18:34:31 2014 +0900 +++ b/example/word_count/task_init.cc Tue Feb 04 02:18:07 2014 +0900 @@ -3,6 +3,9 @@ #ifdef __CERIUM_GPU__ #include "GpuScheduler.h" #endif +#ifdef __CERIUM_CUDA__ +#include "CudaScheduler.h" +#endif /* 必ずこの位置に書いて */ SchedExternTask(Exec); @@ -22,6 +25,11 @@ GpuSchedRegister(TASK_EXEC, "gpu/Exec.cl", "wordcount"); GpuSchedRegister(TASK_EXEC_DATA_PARALLEL, "gpu/Exec_Data_Parallel.cl","wordcount_parallel"); #endif +#ifdef __CERIUM_CUDA__ + CudaSchedRegister(TASK_EXEC, "cuda/Exec.ptx", "wordcount"); + CudaSchedRegister(TASK_EXEC_DATA_PARALLEL, "cuda/Exec_Data_Parallel.ptx","wordcount_parallel"); +#endif + SchedRegisterTask(TASK_EXEC, Exec); SchedRegisterTask(TASK_EXEC_DATA_PARALLEL, Exec_Data_Parallel);