Mercurial > hg > Game > Cerium
changeset 2014:8c618e912c88 draft
Optimize data transfer (currently produces wrong results).
author | Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 01 Jul 2014 17:04:01 +0900 |
parents | d43c2b7932ea |
children | 6bf6450bd45a |
files | TaskManager/Cuda/CudaScheduler.cc TaskManager/Cuda/CudaScheduler.h example/fft/main.cc |
diffstat | 3 files changed, 33 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/TaskManager/Cuda/CudaScheduler.cc Tue Jul 01 11:17:12 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Tue Jul 01 17:04:01 2014 +0900 @@ -13,6 +13,9 @@ #include <sys/stat.h> #include <string.h> #include <cuda.h> +#include <map> + +using namespace std; TaskObject cuda_task_list[MAX_TASK_OBJECT]; @@ -137,6 +140,7 @@ ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } transmitted.insert(make_pair(input_buf, &cudabuffer[cur].memin[param])); + reverse_map.insert(make_pair(&cudabuffer[cur].memin[param], input_buf)); } cudabuffer[cur].kernelParams[param] = transmitted[input_buf]; param++; @@ -151,6 +155,7 @@ createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size); if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } transmitted.insert(make_pair(output_buf, &cudabuffer[cur].memout[i])); + reverse_map.insert(make_pair(&cudabuffer[cur].memout[i], output_buf)); } cudabuffer[cur].kernelParams[param] = transmitted[output_buf]; param++; @@ -183,34 +188,36 @@ int cur = 0; for (;nextTask < tasklist->last(); nextTask = nextTask->next(), cur++) { if (STAGE <= cur) break; + // enable flip : not data transfer device to host + if (flag[cur].flip) continue; for(int i=0;i<nextTask->outData_count;i++) { // read output data ListElement *output_buf = nextTask->outData(i); if (output_buf->size==0) break; - CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ; - int i0 = flag[cur].flip ? 
i+1 : i ; - // flip use memin buffer and memout event - ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream); - if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } - transmitted.erase(output_buf); + if (transmitted.count(output_buf)) { + ret = cuMemcpyDtoHAsync(output_buf->addr, *transmitted[output_buf], output_buf->size, cudabuffer[cur].stream); + if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; } + reverse_map.erase(transmitted[output_buf]); + transmitted.erase(output_buf); + } } } return nextTask; } static void -release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) { +release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem, map<CUdeviceptr*, ListElement*> map) { for (int i=0; i<mem[cur].in_size; i++) { - if (mem[cur].memin[i]) + if (!map.count(&mem[cur].memin[i])) { cuMemFree(mem[cur].memin[i]); - mem[cur].memin[i] = 0; + mem[cur].memin[i] = 0; + } } for (int i=0; i<mem[cur].out_size; i++) { - if (mem[cur].memout[i]) + if (!map.count(&mem[cur].memout[i])) { cuMemFree(mem[cur].memout[i]); - mem[cur].memout[i] = 0; + mem[cur].memout[i] = 0; + } } - mem[cur].in_size = 0; - mem[cur].out_size = 0; } void @@ -235,7 +242,7 @@ for (int i=0;i<cur;i++) { if (cudabuffer[i].in_size > 0 || cudabuffer[i].out_size > 0) - release_buf_event(i, cudabuffer); + release_buf_event(i, cudabuffer, reverse_map); } if(reply) {
--- a/TaskManager/Cuda/CudaScheduler.h Tue Jul 01 11:17:12 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Tue Jul 01 17:04:01 2014 +0900 @@ -48,7 +48,8 @@ CudaBuffer cudabuffer[STAGE]; // record transmitted data. - map<ListElement*, void*> transmitted; + map<ListElement*, CUdeviceptr*> transmitted; + map<CUdeviceptr*, ListElement*> reverse_map; HTask::htask_flag flag[STAGE];
--- a/example/fft/main.cc Tue Jul 01 11:17:12 2014 +0900 +++ b/example/fft/main.cc Tue Jul 01 17:04:01 2014 +0900 @@ -126,6 +126,7 @@ brev->set_inData(0, src, length_src*sizeof(cl_float2)); brev->set_outData(0, dst, length_dst*sizeof(cl_float2)); brev->set_cpu(spe_cpu); + brev->flip(); brev->wait_for(waitTask); brev->iterate(gws[0],gws[1]); @@ -141,6 +142,7 @@ bfly->set_inData(1, spin, sizeof(cl_float2)*(n/2)); bfly->set_outData(0,dst,length_dst*sizeof(cl_float2)); bfly->set_cpu(spe_cpu); + bfly->flip(); bfly->wait_for(waitTask); bfly->iterate(gws[0],gws[1]); waitTask = bfly; @@ -163,9 +165,9 @@ char * init(int argc, char**argv){ - + char *filename = 0; - + // printf("%s ",argv[4]); for (int i = 1; argv[i]; ++i) { if (strcmp(argv[i], "-file") == 0) { @@ -191,11 +193,11 @@ long m = (cl_int)(log((double)n)/log(2.0)); size_t *gws = new size_t[2]; size_t *lws = new size_t[2]; - + xm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); rm = (cl_float2 *)malloc(n * n * sizeof(cl_float2)); wm = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); - + HTask* waitTask; /* * [cl_float2] @@ -226,6 +228,7 @@ sfac->set_outData(0, wm, length_w*sizeof(cl_float2)); sfac->set_param(0,n); sfac->set_cpu(spe_cpu); + sfac->flip(); sfac->iterate(gws[0]); // Butterfly Operation @@ -239,6 +242,7 @@ first_trns->set_outData(0,xm,length_r*sizeof(cl_float2)); first_trns->set_param(0,n); first_trns->set_cpu(spe_cpu); + first_trns->flip(); first_trns->wait_for(waitTask); first_trns->iterate(gws[0],gws[1]); @@ -254,6 +258,7 @@ hpfl->set_param(0,n); hpfl->set_param(1,(long)radius); hpfl->set_cpu(spe_cpu); + hpfl->flip(); hpfl->wait_for(waitTask); hpfl->iterate(gws[0],gws[1]); @@ -269,6 +274,7 @@ second_trns->set_outData(0,rm,length_r*sizeof(cl_float2)); second_trns->set_param(0,n); second_trns->set_cpu(spe_cpu); + second_trns->flip(); second_trns->wait_for(waitTask); second_trns->iterate(gws[0],gws[1]);