Mercurial > hg > Members > Moririn
view src/parallel_execution/CUDAtwice.cbc @ 303:1dbaef86593b
CUDAtwice.cbc
author | ikkun |
---|---|
date | Mon, 13 Feb 2017 18:23:29 +0900 |
parents | 8e7926f3e271 |
children | ae4f6aa427f5 |
line wrap: on
line source
#include <stdio.h> #include "context.h" #include "origin_cs.h" __code twice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) { int i = loopCounter->i; if (i < prefix) { array[i+index*prefix] = array[i+index*prefix]*2; loopCounter->i++; goto meta(context, C_twice); } loopCounter->i = 0; goto meta(workerContext, workerContext->next); } __code twice_stub(struct Context* context) { struct Context* workerContext = context->worker->worker->CPUWorker.context; // memory allocate CUdeviceptr devA; CUdeviceptr devB[num_exec]; CUdeviceptr devOut[num_exec]; checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); for (int i=0;i<num_exec;i++) { checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); } //twiceカーネルが定義されてなければそれをロードする checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); //入力のDataGearをGPUにbuffer経由で送る // Synchronous data transfer(host to device) checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); // Asynchronous launch kernel for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur=0; //B[i] = (float)(i+1); //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); void* args[] = {&devA, &devB[i], &devOut[i]}; checkCudaErrors(cuLaunchKernel(function, LENGTH, 1, 1, THREAD, 1, 1, 0, num_stream ? stream[cur] : NULL , args, NULL)); //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); } //結果を取ってくるコマンドを入力する //コマンドの終了待ちを行う // Asynchronous data transfer(device to host) for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; if (num_stream) { checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); } else { checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); } } // wait for stream for (int i=0;i<num_stream;i++) checkCudaErrors(cuStreamSynchronize(stream[i])); // Asynchronous data transfer(device to host) for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; if (num_stream) { checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); } else { checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); } } // wait for stream for (int i=0;i<num_stream;i++) checkCudaErrors(cuStreamSynchronize(stream[i])); //continationにそってGPUworkerに戻る goto twice(context, Gearef(context, LoopCounter), 0, 0, NULL, workerContext); }