changeset 1919:d6e033734c12 draft

running cuda sample
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Tue, 28 Jan 2014 18:33:19 +0900
parents 15e8c50ed570
children 273638411ebf
files example/Cuda/Makefile example/Cuda/Makefile.def example/Cuda/main.cc example/Cuda/multiply.cu
diffstat 4 files changed, 37 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/example/Cuda/Makefile	Fri Jan 24 07:16:26 2014 +0900
+++ b/example/Cuda/Makefile	Tue Jan 28 18:33:19 2014 +0900
@@ -1,7 +1,8 @@
 include ./Makefile.def
 
 SRCS_TMP = $(wildcard *.cc)
-SRCS_EXCLUDE = # 除外するファイルを書く																				 
+SRCS_EXCLUDE = # 除外するファイルを書く
+
 SRCS = $(filter-out $(SRCS_EXCLUDE),$(SRCS_TMP))
 OBJS = $(SRCS:.cc=.o)
 
@@ -13,7 +14,17 @@
 
 LIBS = -I/Developer/NVIDIA/CUDA-5.5/include -F/Library/Frameworks -framework CUDA
 
-.SUFFIXES: .cc .o
+CUDA_SRCS_TMP = $(wildcard *.cu)
+CUDA_SRCS_EXCLUDE = # list files to exclude here
+# NVCCFLAGS is -ptx, so every .cu source maps to a .ptx output of the same name.
+CUDA_SRCS = $(filter-out $(CUDA_SRCS_EXCLUDE),$(CUDA_SRCS_TMP))
+CUDA_OBJS = $(CUDA_SRCS:.cu=.ptx)
+# Same .cu -> .ptx mapping for kernels found in the task directories.
+CUDA_TASK_SRCS_TMP = $(wildcard $(TASK_DIR2)/*.cu $(TASK_DIR1)/*.cu)
+CUDA_TASK_SRCS = $(filter-out $(TASK_DIR1)/$(TASK_SRCS_EXCLUDE),$(CUDA_TASK_SRCS_TMP))
+CUDA_TASK_OBJS = $(CUDA_TASK_SRCS:.cu=.ptx)
+
+.SUFFIXES: .cc .o .cu
 
 .cc.o:
 	$(CC) $(CFLAGS) $(LIBS) $(INCLUDE) -c $< -o $@
--- a/example/Cuda/Makefile.def	Fri Jan 24 07:16:26 2014 +0900
+++ b/example/Cuda/Makefile.def	Tue Jan 28 18:33:19 2014 +0900
@@ -5,4 +5,6 @@
 OPT = -g -O0
 
 CC = clang++
-CFLAGS = -Wall $(OPT)
\ No newline at end of file
+NVCC = nvcc
+CFLAGS = -Wall $(OPT)
+NVCCFLAGS = -ptx
\ No newline at end of file
--- a/example/Cuda/main.cc	Fri Jan 24 07:16:26 2014 +0900
+++ b/example/Cuda/main.cc	Tue Jan 28 18:33:19 2014 +0900
@@ -15,25 +15,29 @@
     return;
 }
 
-
+// Debug helper: writes the first LENGTH elements of C to stdout, one value per line.
+void print_result(float* C) {
+    int idx = 0;
+    while (idx < LENGTH) printf("%f\n", C[idx++]);
+}
 
 int main() {
     CUdevice device;
     CUcontext context;
     CUmodule module;
     CUfunction function;
-    //    CUresult result;
+    CUresult result;
 
     cuInit(0);
     cuDeviceGet(&device, 0);
     cuCtxCreate(&context, 0, device);
-    cuModuleLoad(&module, "multiply.cu");
+    cuModuleLoad(&module, "multiply.ptx");
     cuModuleGetFunction(&function, module, "multiply");
-
+    
     float* A = new float[LENGTH];
     float* B = new float[LENGTH];
     float* C = new float[LENGTH];
-
+    
     for (int i=0; i<LENGTH; i++) {
         A[i] = (float)(i+1000);
         B[i] = (float)(i+1)/10.f;
@@ -49,28 +53,26 @@
     cuMemcpyHtoD(devB, B, LENGTH*sizeof(float));
     cuMemcpyHtoD(devC, C, LENGTH*sizeof(float));
 
-    cuParamSetv(function, 0, A, LENGTH*sizeof(float));
-    cuParamSetv(function, 0, B, LENGTH*sizeof(float));
-    cuParamSetv(function, 0, C, LENGTH*sizeof(float));
-
+    // cuLaunchKernel expects an array holding the address of each kernel argument, in parameter order (A, B, C).
+    void* args[] = {&devA, &devB, &devC};
     cuLaunchKernel(function,
                    LENGTH, 1, 1,
                    1, 1, 1,
-                   0, NULL, NULL, NULL);
-
+                   0, NULL, args, NULL);
+    
     cuMemcpyDtoH(C, devC, LENGTH*sizeof(float));
 
+    //    print_result(C);
     check_data(A, B, C);
 
     delete[] A;
     delete[] B;
     delete[] C;
-    cuModuleUnload(module);
     cuMemFree(devA);
     cuMemFree(devB);
     cuMemFree(devC);
-    
+    cuModuleUnload(module);
+    cuCtxDestroy(context);
 
     return 0;
 }
-    
--- a/example/Cuda/multiply.cu	Fri Jan 24 07:16:26 2014 +0900
+++ b/example/Cuda/multiply.cu	Tue Jan 28 18:33:19 2014 +0900
@@ -1,5 +1,6 @@
-__global__ void multiply(int* A, int* B, int* C) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    
-    C[index] = A[index] * B[index];
+// Element-wise C = A * B, one element per thread. There is no bounds check: launch exactly one thread per element.
+// extern "C" leaves the symbol unmangled so the host can look the kernel up by the plain name "multiply".
+extern "C" __global__ void multiply(float* A, float* B, float* C) {
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    C[i] = A[i] * B[i];
+}