diff src/parallel_execution/cuda.c @ 431:b3359544adbb

Edit cudaExec, but it does not work
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Tue, 17 Oct 2017 01:50:12 +0900
parents 35b37fe8d3a7
children d920f3a3f037
line wrap: on
line diff
--- a/src/parallel_execution/cuda.c	Mon Oct 09 17:46:42 2017 +0900
+++ b/src/parallel_execution/cuda.c	Tue Oct 17 01:50:12 2017 +0900
@@ -85,58 +85,102 @@
 }
 
 
-void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray) {
-    //printf("cuda exec start\n");
-    // Worker *worker = context->worker;
-    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
-    // memory allocate
-    CUdeviceptr devA;
-    CUdeviceptr devB;
-    CUdeviceptr devC;
-    CUdeviceptr devD;
+/*
+ * Zero-initialised host allocation for cudaExec's bookkeeping.
+ * A failed allocation is reported through checkCudaErrors, like every driver
+ * error in cudaExec, and then the process is aborted so NULL is never used.
+ * A NULL result for a zero-byte request is not treated as a failure.
+ */
+static void* cudaExecCalloc(size_t count, size_t size) {
+    void* memory = calloc(count, size);
+    if (memory == NULL && count != 0 && size != 0) {
+        checkCudaErrors(CUDA_ERROR_OUT_OF_MEMORY);
+        abort();
+    }
+    return memory;
+}
+
+/*
+ * Run the kernel `function` from the module file `filename` on the data in `buffer`.
+ *
+ * Every entry of buffer->inputData, followed by every entry of buffer->outputData,
+ * is copied into a freshly allocated device buffer of GET_SIZE(entry) bytes; the
+ * device pointers are handed to the kernel in that order. After the launch every
+ * entry is copied back to the host and all device and host scratch memory is freed.
+ *
+ * context  : receives the loaded module and function; context->num_exec is set to 1.
+ *            When context->iterate is set the grid is the iterator's x/y/z,
+ *            otherwise 1x1x1. The block size is always 1x1x1.
+ * buffer   : host data to transfer. buffer->kernelParams is scratch space owned by
+ *            this call and is NULL again on return.
+ * filename : path handed to cuModuleLoad.
+ * function : kernel name handed to cuModuleGetFunction.
+ */
+void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function) {
+    int paramLen = buffer->inputLen + buffer->outputLen;
+    // Each slot points at a host-side CUdeviceptr holding one kernel argument.
+    buffer->kernelParams = (void **)cudaExecCalloc(paramLen, sizeof(void *));
+    int paramCount = 0;
+    for (int i = 0; i < buffer->inputLen; i++) {
+        size_t size = GET_SIZE(buffer->inputData[i]);
+        CUdeviceptr* deviceptr = (CUdeviceptr *)cudaExecCalloc(1, sizeof(CUdeviceptr));
+        // memory allocate
+        checkCudaErrors(cuMemAlloc(deviceptr, size));
+        // Synchronous data transfer(host to device)
+        checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->inputData[i], size));
+        buffer->kernelParams[paramCount++] = deviceptr;
+    }
 
-    checkCudaErrors(cuMemAlloc(&devA, sizeof(struct Integer)*GET_LEN(inputSortArray->array)));
-    checkCudaErrors(cuMemAlloc(&devB, sizeof(int)));
-    checkCudaErrors(cuMemAlloc(&devC, sizeof(int)));
-    checkCudaErrors(cuMemAlloc(&devD, sizeof(int)));
+    for (int i = 0; i < buffer->outputLen; i++) {
+        size_t size = GET_SIZE(buffer->outputData[i]);
+        CUdeviceptr* deviceptr = (CUdeviceptr *)cudaExecCalloc(1, sizeof(CUdeviceptr));
+        // memory allocate
+        checkCudaErrors(cuMemAlloc(deviceptr, size));
+        // Synchronous data transfer(host to device)
+        checkCudaErrors(cuMemcpyHtoD(*deviceptr, buffer->outputData[i], size));
+        buffer->kernelParams[paramCount++] = deviceptr;
+    }
 
-    //twiceカーネルが定義されてなければそれをロードする
-    checkCudaErrors(cuModuleLoad(&context->module, "c/examples/bitonicSort/CUDAbitonicSwap.ptx"));
-    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "bitonicSwap"));
-
-    //入力のDataGearをGPUにbuffer経由で送る
-    // Synchronous data transfer(host to device)
-    checkCudaErrors(cuMemcpyHtoD(devA, inputSortArray->array, sizeof(struct Integer)*GET_LEN(inputSortArray->array)));
-    checkCudaErrors(cuMemcpyHtoD(devB, &inputSortArray->block, sizeof(int)));
-    checkCudaErrors(cuMemcpyHtoD(devC, &inputSortArray->first, sizeof(int)));
-    checkCudaErrors(cuMemcpyHtoD(devD, &inputSortArray->prefix, sizeof(int)));
+    // Load the module and look up the kernel on every call.
+    // NOTE(review): nothing in this function unloads context->module; confirm it is released elsewhere.
+    checkCudaErrors(cuModuleLoad(&context->module, filename));
+    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function));
 
     // Asynchronous launch kernel
     context->num_exec = 1;
-    void* args[] = {&devA, &devB, &devC, &devD};
     if (context->iterate) {
         struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
         checkCudaErrors(cuLaunchKernel(context->function,
                     iterator->x, iterator->y, iterator->z,
                     1, 1, 1,
-                    0, NULL, args, NULL));
+                    0, NULL, buffer->kernelParams, NULL));
 
     } else {
         checkCudaErrors(cuLaunchKernel(context->function,
                     1, 1, 1,
                     1, 1, 1,
-                    0, NULL, args, NULL));
+                    0, NULL, buffer->kernelParams, NULL));
     }
-    //結果を取ってくるコマンドを入力する
-    //コマンドの終了待ちを行う   
-    checkCudaErrors(cuMemcpyDtoH(inputSortArray->array, devA, sizeof(struct Integer)*GET_LEN(inputSortArray->array)));
-    outputSortArray->array = inputSortArray->array;
+
+    // Fetch the results back to the host, then release each device buffer
+    // together with the host-side slot that held its pointer.
+    paramCount = 0;
+    for (int i = 0; i < buffer->inputLen; i++) {
+        CUdeviceptr* deviceptr = (CUdeviceptr *)buffer->kernelParams[paramCount++];
+        checkCudaErrors(cuMemcpyDtoH(buffer->inputData[i], *deviceptr, GET_SIZE(buffer->inputData[i])));
+        checkCudaErrors(cuMemFree(*deviceptr));
+        free(deviceptr);
+    }
+
+    for (int i = 0; i < buffer->outputLen; i++) {
+        CUdeviceptr* deviceptr = (CUdeviceptr *)buffer->kernelParams[paramCount++];
+        checkCudaErrors(cuMemcpyDtoH(buffer->outputData[i], *deviceptr, GET_SIZE(buffer->outputData[i])));
+        checkCudaErrors(cuMemFree(*deviceptr));
+        free(deviceptr);
+    }
+    free(buffer->kernelParams);
+    // Do not leave a dangling pointer behind in the caller's buffer.
+    buffer->kernelParams = NULL;
     // wait for stream
     checkCudaErrors(cuCtxSynchronize());
-    cuMemFree(devA);
-    cuMemFree(devB);
-    cuMemFree(devC);
-    cuMemFree(devD);
 }
 
 void cudaShutdown( struct CUDAWorker *worker) {