Mercurial > hg > Game > Cerium
changeset 1938:71cb3365d9fb draft
merge
author | Masataka Kohagura <e085726@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 03 Feb 2014 17:44:49 +0900 |
parents | e5b4e61b6f85 (current diff) e8ca9cae59fc (diff) |
children | d862c573bf84 |
files | |
diffstat | 14 files changed, 74 insertions(+), 67 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/Cell/spe/SpeTaskManagerImpl.h Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/Cell/spe/SpeTaskManagerImpl.h Mon Feb 03 17:44:49 2014 +0900 @@ -40,7 +40,6 @@ void append_waitTask(HTask* p); #endif - #ifdef __CERIUM_CUDA__ SpeTaskManagerImpl(int i);
--- a/TaskManager/Cuda/CudaError.h Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/Cuda/CudaError.h Mon Feb 03 17:44:49 2014 +0900 @@ -41,7 +41,7 @@ {CUDA_ERROR_UNKNOWN, "CUDA_ERROR_UNKNOWN"}, {0, NULL} }; - const char* message = "UNKNOW ERROR."; + const char* message = "UNKNOWN ERROR."; for(int i=0; Error_Status[i].status_string != NULL; i++) { if (Error_Status[i].status == status) {
--- a/TaskManager/Cuda/CudaScheduler.cc Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Mon Feb 03 17:44:49 2014 +0900 @@ -17,7 +17,6 @@ TaskObject cuda_task_list[MAX_TASK_OBJECT]; CudaScheduler::CudaScheduler() { - init_gpu(); } void @@ -47,7 +46,7 @@ m->memin = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->memout = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*)); - ret = cuStreamCreate(&m->stream, 0); + ret = cuStreamCreate(&(m->stream), 0); if (ret!=0) error(convert_error_status(ret)); } @@ -69,8 +68,8 @@ m->stream = 0; } -CUdeviceptr -CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size, int* error) { +void +CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size) { if (i > cudabuffer->allcate_size) { cudabuffer->allcate_size *= 2; cudabuffer->memin = (CUdeviceptr*)realloc(cudabuffer->memin, cudabuffer->allcate_size*sizeof(CUdeviceptr*)); @@ -78,9 +77,7 @@ cudabuffer->event = (CUevent*)realloc(cudabuffer->event, cudabuffer->allcate_size*sizeof(CUevent*)); } - error = (int*)cuMemAlloc(&mem[i], size); - - return mem[i]; + ret = cuMemAlloc(&mem[i], size); } #define NOP_REPLY NULL @@ -155,29 +152,25 @@ void CudaScheduler::run() { + init_gpu(); int cur = 0; - int stage = 8; TaskListPtr tasklist = NULL; reply = 0; - cudabuffer = (CudaBuffer*)malloc(sizeof(CudaBuffer*)*stage); - - for (int i = 0; i<stage; i++) { + + for (int i = 0; i<STAGE; i++) { initCudaBuffer(&cudabuffer[i]); + kernel_event[i]=0; } - memset(&flag, 0, sizeof(HTask::htask_flag)*2); + memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE); for (;;) { memaddr param_addr = connector->task_list_mail_read(); if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) { - for (int i = 0; i<stage; i++) { - ret = cuStreamSynchronize(cudabuffer[i].stream); - if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } - + for (int i = 0; i<STAGE; i++) { destroyCudaBuffer(&cudabuffer[i]); } - free(cudabuffer); return; } @@ -210,15 +203,15 @@ ret = cuModuleGetFunction(&kernel[cur], module, funcname); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - + int param = 0; // set arg count - CUdeviceptr memparam = createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count, &ret); + createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } // parameter is passed as first kernel arg - ret = cuMemcpyHtoDAsync(memparam, nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); + ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } param++; @@ -226,7 +219,7 @@ for(int i=0;i<nextTask->inData_count;i++) { ListElement *input_buf = nextTask->inData(i); if (input_buf->size==0) break; - createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size, &ret); + createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } @@ -239,30 +232,28 @@ ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; if (!flag[cur].flip) { // flip use memin for output - createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size, &ret); + createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } // enqueue later } param++; } cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event - + void** kernelParams; - + if (!flag[cur].flip) { kernelParams = (void**)malloc(sizeof(void*)*param); - kernelParams[0] = &memparam; - for (int i = 1; i<cudabuffer[cur].in_size; i++) { - kernelParams[i] = &cudabuffer[cur].memin[i-1]; + for (int i = 0; i<cudabuffer[cur].in_size; i++) { + kernelParams[i] = &cudabuffer[cur].memin[i]; } for (int i = 0; i<cudabuffer[cur].out_size; i++) { kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i]; } } else { kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size); - kernelParams[0] = &memparam; - for (int i = 1; i<cudabuffer[cur].in_size; i++) { - kernelParams[i] = &cudabuffer[cur].memin[i-1]; + for (int i = 0; i<cudabuffer[cur].in_size-1; i++) { + kernelParams[i] = &cudabuffer[cur].memin[i]; } } @@ -304,12 +295,12 @@ // pipeline : cur // to stop pipeline set cur+1 if (cur == 0) { - wait_for_event(kernel_event, cudabuffer, tasklist, stage); // to stop pipeline comment out this line + wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); // to stop pipeline comment out this line } else { wait_for_event(kernel_event, cudabuffer, tasklist, cur); } cur += 1; - if (stage <= cur) + if (STAGE <= cur) cur = 0; free(kernelParams); cuModuleUnload(module); @@ -318,6 +309,10 @@ param_addr = (memaddr)tasklist->next; } wait_for_event(kernel_event, cudabuffer, tasklist, cur); + for (int i = 0; i<STAGE; i++) { + ret = cuStreamSynchronize(cudabuffer[i].stream); + if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; } + } unsigned long long wait = 0; (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time)); @@ -343,13 +338,13 @@ return 1; } - if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->module == 0) { + if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->filename == 0) { fprintf(stderr, "CUDA module %d not defined.\n",cmd); return 0; } CUmodule* module = new CUmodule; - ret = cuModuleLoad(module, (const char*)cuda_task_list[cmd].cudatask->module); + ret = cuModuleLoad(module, cuda_task_list[cmd].cudatask->filename); if(ret!=0) { error(convert_error_status(ret)); @@ -367,7 +362,7 @@ cuda_task_list[cmd].load = null_loader; cuda_task_list[cmd].wait = null_loader; cuda_task_list[cmd].name = functionname; - cuda_task_list[cmd].cudatask->module = (CUmodule*)filename; + cuda_task_list[cmd].cudatask->filename = (const char*)filename; } /* end */
--- a/TaskManager/Cuda/CudaScheduler.h Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Mon Feb 03 17:44:49 2014 +0900 @@ -10,6 +10,8 @@ extern TaskObject cuda_task_list[MAX_TASK_OBJECT]; +#define STAGE 8 + class CudaScheduler : public MainScheduler { public: typedef struct cudabuffer { @@ -42,14 +44,15 @@ // cl_kernel に相当 // 変数名は function にすべきか kernel にすべきか // とりあえず、kernel で - CUfunction kernel[2]; - CUevent kernel_event[2]; - CudaBuffer* cudabuffer; - HTask::htask_flag flag[2]; + CUfunction kernel[STAGE]; + CUevent kernel_event[STAGE]; + CudaBuffer cudabuffer[STAGE]; + + HTask::htask_flag flag[STAGE]; private: int load_kernel(int cmd); - CUdeviceptr createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size, int* error); + void createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size); void initCudaBuffer(CudaBufferPtr m); void destroyCudaBuffer(CudaBufferPtr m); void CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret);
--- a/TaskManager/Makefile Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/Makefile Mon Feb 03 17:44:49 2014 +0900 @@ -54,6 +54,7 @@ $(MAKE) -f Makefile.cell cellclean $(MAKE) -f Makefile.fifo fifoclean $(MAKE) -f Makefile.gpu gpuclean + $(MAKE) -f Makefile.cuda cudaclean rm -rf *.a ../include tags:
--- a/TaskManager/kernel/schedule/Scheduler.cc Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/kernel/schedule/Scheduler.cc Mon Feb 03 17:44:49 2014 +0900 @@ -41,6 +41,7 @@ task_list[i].load = null_loader; task_list[i].wait = null_loader; task_list[i].gputask = new GpuTaskObject(); + task_list[i].cudatask = new CudaTaskObject(); } }
--- a/TaskManager/kernel/schedule/Scheduler.h Mon Feb 03 14:15:36 2014 +0900 +++ b/TaskManager/kernel/schedule/Scheduler.h Mon Feb 03 17:44:49 2014 +0900 @@ -46,6 +46,7 @@ typedef struct cuda_task_object { #ifdef __CERIUM_CUDA__ + const char* filename; CUmodule* module; #endif } CudaTaskObject;
--- a/example/Cuda/main.cc Mon Feb 03 14:15:36 2014 +0900 +++ b/example/Cuda/main.cc Mon Feb 03 17:44:49 2014 +0900 @@ -26,7 +26,6 @@ CUcontext context; CUmodule module; CUfunction function; - CUstream stream; cuInit(0); cuDeviceGet(&device, 0); @@ -34,7 +33,15 @@ cuModuleLoad(&module, "multiply.ptx"); cuModuleGetFunction(&function, module, "multiply"); - cuStreamCreate(&stream,0); + CUresult ret; + int size = 8; + CUstream stream1[size]; + + for (int i=0;i<size;i++) { + ret=cuStreamCreate(&stream1[i],0); + } + + printf("%d\n",ret); float* A = new float[LENGTH]; @@ -52,22 +59,23 @@ cuMemAlloc(&devB, LENGTH*sizeof(float)); cuMemAlloc(&devC, LENGTH*sizeof(float)); - cuMemcpyHtoDAsync(devA, A, LENGTH*sizeof(float), stream); - cuMemcpyHtoDAsync(devB, B, LENGTH*sizeof(float), stream); - cuMemcpyHtoDAsync(devC, C, LENGTH*sizeof(float), stream); - + cuMemcpyHtoDAsync(devA, A, LENGTH*sizeof(float), stream1[0]); + cuMemcpyHtoDAsync(devB, B, LENGTH*sizeof(float), stream1[0]); + // void* args[] = {&devA, &devB, &devC}; - void** args=(void**)malloc(sizeof(void*)*3); - args[0] = &devA; - args[1] = &devB; - args[2] = &devC; + void** args=NULL; + // args=(void**)malloc(sizeof(void*)*8); + // args[0] = &devA; + // args[1] = &devB; + // args[2] = &devC; - cuLaunchKernel(function, - LENGTH, 1, 1, - 1, 1, 1, - 0, stream, args, NULL); + ret=cuLaunchKernel(function, + LENGTH, 1, 1, + 1, 1, 1, + 0, stream1[0], args, NULL); + printf("%d\n",ret); - cuMemcpyDtoHAsync(C, devC, LENGTH*sizeof(float), stream); + cuMemcpyDtoHAsync(C, devC, LENGTH*sizeof(float), stream1[0]); // print_result(C); check_data(A, B, C); @@ -79,7 +87,7 @@ cuMemFree(devB); cuMemFree(devC); cuModuleUnload(module); - cuStreamDestroy(stream); + cuStreamDestroy(stream1[0]); cuCtxDestroy(context); return 0;
--- a/example/Cuda/multiply.cu Mon Feb 03 14:15:36 2014 +0900 +++ b/example/Cuda/multiply.cu Mon Feb 03 17:44:49 2014 +0900 @@ -1,6 +1,7 @@ extern "C" { - __global__ void multiply(float* A, float* B, float* C) { + __global__ void multiply(/*float* A, float* B, float* C*/) { int index = blockIdx.x * blockDim.x + threadIdx.x; - C[index] = A[index] * B[index]; + //C[index] = A[index] * B[index]; + printf("%d\n",index); } }
--- a/example/multiply/Func.h Mon Feb 03 14:15:36 2014 +0900 +++ b/example/multiply/Func.h Mon Feb 03 17:44:49 2014 +0900 @@ -3,4 +3,4 @@ MULTIPLY_TASK, }; -#define DATA_NUM 10000000 +#define DATA_NUM 1000
--- a/example/multiply/cuda/gpu_task_init.cc Mon Feb 03 14:15:36 2014 +0900 +++ b/example/multiply/cuda/gpu_task_init.cc Mon Feb 03 17:44:49 2014 +0900 @@ -12,5 +12,5 @@ void gpu_task_init(void) { - CudaSchedRegister(MULTIPLY_TASK, "gpu/Multi.ptx","multi"); + CudaSchedRegister(MULTIPLY_TASK, "cuda/multiply.ptx","multi"); }
--- a/example/multiply/cuda/multiply.cu Mon Feb 03 14:15:36 2014 +0900 +++ b/example/multiply/cuda/multiply.cu Mon Feb 03 17:44:49 2014 +0900 @@ -1,5 +1,5 @@ extern "C" { - __global__ void multi(float* A, float* B, float* C) { + __global__ void multi(void* params, float* A, float* B, float* C) { int id = blockIdx.x * blockDim.x + threadIdx.x; C[id]=A[id]*B[id]; }
--- a/example/multiply/gpu/Multi.cl Mon Feb 03 14:15:36 2014 +0900 +++ b/example/multiply/gpu/Multi.cl Mon Feb 03 17:44:49 2014 +0900 @@ -1,5 +1,5 @@ __kernel void -multi(__global const long *params,__global const float *A, __global const float*B,__global float* C_, __global float *C) +multi(__global const long *params, __global const float* A, __global const float* B, __global float* C) { // int i=get_global_id(0); long length = (long)params[0]; @@ -7,7 +7,4 @@ // for(int i=0;i<length;i++) { if(length) C[id]=A[id]*B[id]; - else - C[id] = C_[id]; - //} }
--- a/example/multiply/main.cc Mon Feb 03 14:15:36 2014 +0900 +++ b/example/multiply/main.cc Mon Feb 03 17:44:49 2014 +0900 @@ -103,6 +103,7 @@ * add_outData(address of output area, size of output area); */ multiply->set_outData(0,(memaddr)C, sizeof(float)*length); + multiply->set_param(0,(long)length); // param 0に0~length-1をsetしたtaskをlength個spawnする multiply->iterate(length);