changeset 2067:0e2389a5ac4e draft

fix Cuda example
author ikkun
date Fri, 03 Feb 2017 19:09:16 +0900
parents 53643db3f0f9
children 45c230f52257
files example/Cuda/Makefile example/Cuda/main.cc example/Cuda/multiply.cu
diffstat 3 files changed, 44 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/example/Cuda/Makefile	Wed Feb 17 17:49:13 2016 +0900
+++ b/example/Cuda/Makefile	Fri Feb 03 19:09:16 2017 +0900
@@ -30,7 +30,7 @@
 	$(CC) -o $(TARGET) $(OBJS) $(TASK_OBJS) $(LIBS)
 
 debug: $(TARGET)
-	sudo gdb ./$(TARGET)
+	sudo lldb ./$(TARGET)
 
 clean:
 	rm -f $(TARGET) $(OBJS) $(TASK_OBJS) $(CUDA_OBJS)
--- a/example/Cuda/main.cc	Wed Feb 17 17:49:13 2016 +0900
+++ b/example/Cuda/main.cc	Fri Feb 03 19:09:16 2017 +0900
@@ -2,11 +2,28 @@
 #include <sys/time.h>
 #include <string.h>
 #include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdlib.h>
 
-#define LENGTH 10
-#define THREAD 10
+
+#define LENGTH (10)
+#define THREAD (10)
+
+// Write a one-line description of a CUDA runtime error to stderr.
+//   err    - runtime-API error code; its text comes from cudaGetErrorString()
+//   file   - source file name to show in the message
+//   lineNo - source line number to show in the message
+void report_error(cudaError_t err, const char* file, int lineNo)
+{
+    const char* description = cudaGetErrorString(err);
+    fprintf(stderr,
+            "[cudaError] %s (error code: %d) at %s line %d\n",
+            description, err, file, lineNo);
+}
 
-static double
+// CUDA_CALL(func): evaluate a CUDA driver-API call (cu*) once and abort the
+// program if it does not return CUDA_SUCCESS.
+//
+// The CUresult returned by the call itself is what gets reported.
+// cudaGetLastError() reports runtime-API errors, so querying it after a failed
+// driver-API call would normally yield cudaSuccess: a "no error" message
+// followed by exit status 0. The message text is looked up with
+// cuGetErrorString(); if that lookup fails the text falls back to
+// "unknown error". The process exits with EXIT_FAILURE so the failure is
+// always visible to the caller of the program.
+#define CUDA_CALL(func) \
+    do { \
+        CUresult cuda_call_res_ = (func); \
+        if (cuda_call_res_ != CUDA_SUCCESS) { \
+            const char* cuda_call_msg_ = NULL; \
+            if (cuGetErrorString(cuda_call_res_, &cuda_call_msg_) != CUDA_SUCCESS \
+                || cuda_call_msg_ == NULL) { \
+                cuda_call_msg_ = "unknown error"; \
+            } \
+            fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", \
+                    cuda_call_msg_, (int)cuda_call_res_, __FILE__, __LINE__); \
+            exit(EXIT_FAILURE); \
+        } \
+    } while(0)
+
+double
 getTime() {
     struct timeval tv;
     gettimeofday(&tv, NULL);
@@ -47,23 +64,23 @@
     CUfunction function;
     CUstream stream[num_stream];
 
-    cuInit(0);
-    cuDeviceGet(&device, 0);
-    cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device);
-    cuModuleLoad(&module, "multiply.ptx");
-    cuModuleGetFunction(&function, module, "multiply");
+    CUDA_CALL(cuInit(0));
+    CUDA_CALL(cuDeviceGet(&device, 0));
+    CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
+    CUDA_CALL(cuModuleLoad(&module, "multiply.ptx"));
+    CUDA_CALL(cuModuleGetFunction(&function, module, "multiply"));
     for (int i=0;i<num_stream;i++)
-        cuStreamCreate(&stream[i],0);
+        CUDA_CALL(cuStreamCreate(&stream[i],0));
 
     // memory allocate
     CUdeviceptr devA;
     CUdeviceptr devB[num_exec];
     CUdeviceptr devOut[num_exec];
 
-    cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float));
+    CUDA_CALL(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
     for (int i=0;i<num_exec;i++) {
-        cuMemAlloc(&devB[i], sizeof(float));
-        cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float));
+        CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float)));
+        CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
     }
 
     // input buffer
@@ -80,7 +97,7 @@
         result[i] = new float[LENGTH*THREAD];
 
     // Synchronous data transfer(host to device)
-    cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float));
+    CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
     
     // Asynchronous data transfer(host to device)
     int cur = 0;
@@ -89,7 +106,7 @@
          if (num_stream <= cur)
              cur = 0;
          B[i] = (float)(i+1);
-         cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
+         CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
      }
 
     cur = 0;
@@ -101,10 +118,10 @@
         B[i] = (float)(i+1);
         //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
         void* args[] = {&devA, &devB[i], &devOut[i]};
-        cuLaunchKernel(function,
+        CUDA_CALL(cuLaunchKernel(function,
                        LENGTH, 1, 1,
                        THREAD, 1, 1,
-                       0, stream[cur], args, NULL);
+                                 0, stream[cur], args, NULL));
         //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
     }
 
@@ -115,12 +132,12 @@
      for (int i=0;i<num_exec;i++,cur++) {
          if (num_stream <= cur)
              cur = 0;
-         cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+         CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
      }
     
     // wait for stream
     for (int i=0;i<num_stream;i++)
-        cuStreamSynchronize(stream[i]);
+        CUDA_CALL(cuStreamSynchronize(stream[i]));
     
     //printf("%0.6f\n",getTime()-start);
 
@@ -128,15 +145,15 @@
         check_data(A,(float)(i+1),result[i]);
 
     // memory release
-    cuMemFree(devA);
+    CUDA_CALL(cuMemFree(devA));
     for (int i=0;i<num_exec;i++) {
-        cuMemFree(devB[i]);
-        cuMemFree(devOut[i]);
+        CUDA_CALL(cuMemFree(devB[i]));
+        CUDA_CALL(cuMemFree(devOut[i]));
     }
     for (int i=0;i<num_stream;i++)
-        cuStreamDestroy(stream[i]);
-    cuModuleUnload(module);
-    cuCtxDestroy(context);
+        CUDA_CALL(cuStreamDestroy(stream[i]));
+    CUDA_CALL(cuModuleUnload(module));
+    CUDA_CALL(cuCtxDestroy(context));
 
     delete[] A;
     delete[] B;
@@ -146,3 +163,4 @@
 
     return 0;
 }
+//
--- a/example/Cuda/multiply.cu	Wed Feb 17 17:49:13 2016 +0900
+++ b/example/Cuda/multiply.cu	Fri Feb 03 19:09:16 2017 +0900
@@ -4,5 +4,5 @@
         int index = blockIdx.x * blockDim.x + threadIdx.x;
         C[index] = A[index] * B[0];
     }
+
 }
-