# HG changeset patch # User kkb # Date 1390995144 -32400 # Node ID e801016bd47c18e46a22ead9115ea59ed9064f36 # Parent bbd209709ca1725781211557599c63efc5a8b44e fix diff -r bbd209709ca1 -r e801016bd47c TaskManager/Cuda/CudaScheduler.cc --- a/TaskManager/Cuda/CudaScheduler.cc Wed Jan 29 18:55:59 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.cc Wed Jan 29 20:32:24 2014 +0900 @@ -47,12 +47,14 @@ m->allcate_size = 64; m->buf = (CUdeviceptr*)malloc(m->allcate_size*sizeof(CUdeviceptr*)); m->event = (CUevent*)malloc(m->allcate_size*sizeof(CUevent*)); + m->stream = (CUStream*)malloc(m->allcate_size*sizeof(CUStream*)); } void CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) { free(m->buf); free(m->event); + free(m->stream); m->size = 0; m->allcate_size = 0; m->buf = 0; @@ -65,6 +67,7 @@ m->allcate_size *= 2; m->buf = (CUdeviceptr*)realloc(m->buf, m->allcate_size*sizeof(CUdeviceptr*)); m->event = (CUevent*)remalloc(m->allcate_size*sizeof(CUevent*)); + m->stream = (CUStream*)remalloc(m->allcate_size*sizeof(CUStream*)); } error = cuMemAlloc(&m->buf[i], size); @@ -250,7 +253,7 @@ if (tasklist->dim > 0) { ret = cuLaunchKernel(kernel[cur], - tasklist->x*tasklist->y*tasklist->z, 0, 0, + tasklist->x*tasklist->y*tasklist->z, 1, 1, 1, 1, 1, stream, kernelParams, NULL); } else { diff -r bbd209709ca1 -r e801016bd47c TaskManager/Cuda/CudaScheduler.h --- a/TaskManager/Cuda/CudaScheduler.h Wed Jan 29 18:55:59 2014 +0900 +++ b/TaskManager/Cuda/CudaScheduler.h Wed Jan 29 20:32:24 2014 +0900 @@ -19,6 +19,7 @@ int size; CUdeviceptr* buf; CUevent* event; + CUStream* stream; } CudaBuffer; cudabuffer* CudaBufferPtr; CudaScheduler(); diff -r bbd209709ca1 -r e801016bd47c example/Cuda/main.cc --- a/example/Cuda/main.cc Wed Jan 29 18:55:59 2014 +0900 +++ b/example/Cuda/main.cc Wed Jan 29 20:32:24 2014 +0900 @@ -26,6 +26,7 @@ CUcontext context; CUmodule module; CUfunction function; + CUStream stream; cuInit(0); cuDeviceGet(&device, 0); @@ -33,6 +34,9 @@ cuModuleLoad(&module, "multiply.ptx"); cuModuleGetFunction(&function, module, "multiply"); + cuStramCreate(&steam,0); + + float* A = new float[LENGTH]; float* B = new float[LENGTH]; float* C = new float[LENGTH]; @@ -48,18 +52,19 @@ cuMemAlloc(&devB, LENGTH*sizeof(float)); cuMemAlloc(&devC, LENGTH*sizeof(float)); - cuMemcpyHtoD(devA, A, LENGTH*sizeof(float)); - cuMemcpyHtoD(devB, B, LENGTH*sizeof(float)); - cuMemcpyHtoD(devC, C, LENGTH*sizeof(float)); + cuMemcpyHtoDAsync(devA, A, LENGTH*sizeof(float), stream); + cuMemcpyHtoDAsync(devB, B, LENGTH*sizeof(float), stream); + cuMemcpyHtoDAsync(devC, C, LENGTH*sizeof(float), stream); void* args[] = {&devA, &devB, &devC}; cuLaunchKernel(function, LENGTH, 1, 1, 1, 1, 1, - 0, NULL, args, NULL); + 0, stream, args, NULL); cuMemcpyDtoH(C, devC, LENGTH*sizeof(float)); + cuStreamWaitEvent(stream, ,0); // print_result(C); check_data(A, B, C); @@ -71,6 +76,7 @@ cuMemFree(devB); cuMemFree(devC); cuModuleUnload(module); + cuStreamDestroy(stream); cuCtxDestroy(context); return 0; diff -r bbd209709ca1 -r e801016bd47c example/Cuda/multiply.cu --- a/example/Cuda/multiply.cu Wed Jan 29 18:55:59 2014 +0900 +++ b/example/Cuda/multiply.cu Wed Jan 29 20:32:24 2014 +0900 @@ -1,7 +1,6 @@ extern "C" { __global__ void multiply(float* A, float* B, float* C) { int index = blockIdx.x * blockDim.x + threadIdx.x; - printf("%d\n",index); C[index] = A[index] * B[index]; } }