Mercurial > hg > Gears > GearsAgda
diff src/parallel_execution/CUDAtwice.cbc @ 308:aeddca686007
CUDAtwice
author | ikkun |
---|---|
date | Tue, 14 Feb 2017 16:55:22 +0900 |
parents | ae4f6aa427f5 |
children | 8c2123bb577b |
line wrap: on
line diff
--- a/src/parallel_execution/CUDAtwice.cbc Tue Feb 14 12:31:58 2017 +0900 +++ b/src/parallel_execution/CUDAtwice.cbc Tue Feb 14 16:55:22 2017 +0900 @@ -15,73 +15,38 @@ } __code twice_stub(struct Context* context) { + struct LoopCounter* loopCounter = &context->data[context->dataNum]->LoopCounter; + struct Array* array = &context->data[context->dataNum+1]->Array; + Worker *worker = context->worker; + CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker; // memory allocate CUdeviceptr devA; - CUdeviceptr devB[num_exec]; CUdeviceptr devOut[num_exec]; - checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); - for (int i=0;i<num_exec;i++) { - checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); - checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); - } + checkCudaErrors(cuMemAlloc(&devA, array->size)); //twiceカーネルが定義されてなければそれをロードする - checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); - checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); + checkCudaErrors(cuModuleLoad(&context->module, "CUDAtwice.ptx")); + checkCudaErrors(cuModuleGetFunction(context->&function, module, "twice")); //入力のDataGearをGPUにbuffer経由で送る // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); + checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size)); // Asynchronous launch kernel - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur=0; - //B[i] = (float)(i+1); - //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); - void* args[] = {&devA, &devB[i], &devOut[i]}; - checkCudaErrors(cuLaunchKernel(function, - LENGTH, 1, 1, - THREAD, 1, 1, - 0, num_stream ? stream[cur] : NULL , args, NULL)); - //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); - } + context->num_exec = 1; + void* args[] = {&devA}; + checkCudaErrors(cuLaunchKernel(function, + array->prefix, 1, 1, + context->num_exec, 1, 1, + 0, NULL , args, NULL)); //結果を取ってくるコマンドを入力する - //コマンドの終了待ちを行う - // Asynchronous data transfer(device to host) - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur = 0; - if (num_stream) { - checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); - } else { - checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); - } - } - + //コマンドの終了待ちを行う + checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size)); // wait for stream - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamSynchronize(stream[i])); - // Asynchronous data transfer(device to host) - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur = 0; - if (num_stream) { - checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); - } else { - checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); - } - } - - // wait for stream - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamSynchronize(stream[i])); - //continuationにそってGPUworkerに戻る - struct Context* workerContext = context->worker->worker->CUDAWorker.context; - goto twice(context, Gearef(context, LoopCounter), 0, 0, NULL, workerContext); + goto meta(context, context->next); }