Mercurial > hg > Gears > GearsAgda
diff src/parallel_execution/CUDAtwice.cbc @ 303:1dbaef86593b
CUDAtwice.cbc
author | ikkun |
---|---|
date | Mon, 13 Feb 2017 18:23:29 +0900 |
parents | 8e7926f3e271 |
children | ae4f6aa427f5 |
line wrap: on
line diff
--- a/src/parallel_execution/CUDAtwice.cbc Mon Feb 13 17:58:04 2017 +0900 +++ b/src/parallel_execution/CUDAtwice.cbc Mon Feb 13 18:23:29 2017 +0900 @@ -18,10 +18,72 @@ __code twice_stub(struct Context* context) { struct Context* workerContext = context->worker->worker->CPUWorker.context; + + // memory allocate + CUdeviceptr devA; + CUdeviceptr devB[num_exec]; + CUdeviceptr devOut[num_exec]; + + checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); + for (int i=0;i<num_exec;i++) { + checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); + checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); + } + + //twiceカーネルが定義されてなければそれをロードする + checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); + checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); + //入力のDataGearをGPUにbuffer経由で送る - //twiceカーネルが定義されてなければそれをロードする + // Synchronous data transfer(host to device) + checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); + + // Asynchronous launch kernel + for (int i=0;i<num_exec;i++,cur++) { + if (num_stream <= cur) + cur=0; + //B[i] = (float)(i+1); + //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); + void* args[] = {&devA, &devB[i], &devOut[i]}; + checkCudaErrors(cuLaunchKernel(function, + LENGTH, 1, 1, + THREAD, 1, 1, + 0, num_stream ? stream[cur] : NULL , args, NULL)); + //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); + } + //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う + // Asynchronous data transfer(device to host) + for (int i=0;i<num_exec;i++,cur++) { + if (num_stream <= cur) + cur = 0; + if (num_stream) { + checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); + } else { + checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); + } + } + + // wait for stream + for (int i=0;i<num_stream;i++) + checkCudaErrors(cuStreamSynchronize(stream[i])); + // Asynchronous data transfer(device to host) + for (int i=0;i<num_exec;i++,cur++) { + if (num_stream <= cur) + cur = 0; + if (num_stream) { + checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); + } else { + checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); + } + } + + // wait for stream + for (int i=0;i<num_stream;i++) + checkCudaErrors(cuStreamSynchronize(stream[i])); + + //continationにそってGPUworkerに戻る goto twice(context, Gearef(context, LoopCounter), 0, 0, NULL, workerContext); }