changeset 292:2bc63a22dd21

add twice
author ikkun
date Thu, 09 Feb 2017 19:51:32 +0900
parents 87128b876c63
children 198affea1be1
files src/test/CMakeLists.txt src/test/main.cc src/test/twice.cu
diffstat 3 files changed, 174 insertions(+), 174 deletions(-) [+]
line wrap: on
line diff
--- a/src/test/CMakeLists.txt	Thu Feb 09 19:02:15 2017 +0900
+++ b/src/test/CMakeLists.txt	Thu Feb 09 19:51:32 2017 +0900
@@ -2,13 +2,28 @@
 
 # add_definitions("-Wall -g -O0")
 
-# set(CMAKE_C_COMPILER $ENV{CbC_Clang}/clang)
+set(CMAKE_C_COMPILER $ENV{CBC_COMPILER})  # C compiler path is read from the CBC_COMPILER environment variable
+set(CUDA_LINK_FLAGS "-Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") # static CUDA runtime archive plus an rpath; both locations are hard-coded
+SET( CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" )  # append the CUDA flags to the link line of every executable
 
 cmake_minimum_required(VERSION 2.8)
 find_package(CUDA REQUIRED)
 
-add_custom_command(OUTPUT test.o
-   DEPENDS test.c                          
-   COMMAND cbclang -c test.c 
+add_custom_command(OUTPUT main.o  # compile main.cu to an object file with nvcc
+   DEPENDS main.cu
+   COMMAND nvcc -O -c ${CMAKE_CURRENT_SOURCE_DIR}/main.cu  # full source path: the command runs in the build directory
 )
-cuda_add_executable(cudaExmple main.cu test.o)
+
+add_executable(cudaExmple main.o test.c)  # link the nvcc-built object together with test.c
+
+add_custom_command(OUTPUT twice.o  # compile twice.cu to an object file with nvcc
+   DEPENDS twice.cu
+   COMMAND nvcc -O -c ${CMAKE_CURRENT_SOURCE_DIR}/twice.cu  # full source path: the command runs in the build directory
+)
+
+add_custom_command(OUTPUT multiply.o  # compile multiply.cu to an object file with nvcc
+   DEPENDS multiply.cu
+   COMMAND nvcc -O -c ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu  # full source path: the command runs in the build directory
+)
+
+add_executable(twiceExmple twice.o multiply.o test.c)  # NOTE(review): twice.cu loads multiply.ptx at run time (cuModuleLoad); no rule in this hunk builds a .ptx - confirm
--- a/src/test/main.cc	Thu Feb 09 19:02:15 2017 +0900
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,169 +0,0 @@
-#include <stdio.h>
-#include <sys/time.h>
-#include <string.h>
-#include <stdlib.h>
-
-extern "C"
-{
-//#include <cuda.h>
-}
-#include <cuda_runtime.h>
-
-#define LENGTH (10)
-#define THREAD (10)
-
-void
-report_error(cudaError_t err, const char* file, int lineNo) {
-    fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", cudaGetErrorString(err), err, file, lineNo);
-}
-
-#define CUDA_CALL(func) \
-    do { \
-        if ((func) != CUDA_SUCCESS) { \
-            cudaError_t err = cudaGetLastError();     \
-            report_error(err, __FILE__, __LINE__);      \
-            exit(err); \
-        } \
-    } while(0)
-
-double
-getTime() {
-    struct timeval tv;
-    gettimeofday(&tv, NULL);
-    return tv.tv_sec + (double)tv.tv_usec*1e-6;
-}
-
-void
-check_data(float* A, float B, float* C) {
-    for (int i=0; i<LENGTH*THREAD; i++) {
-        if (A[i]*B!=C[i]) {
-            puts("multiply failure.");
-            return;
-        }
-    }
-    puts("success.");
-}
-
-void print_result(float* C) {
-    for (int i=0; i<LENGTH*THREAD; i++) {
-        printf("%f\n",C[i]);
-    }
-}
-
-int main(int args, char* argv[]) {
-    int num_stream = 1; // number of stream
-    int num_exec = 16; // number of executed kernel
-    
-    for (int i=1;argv[i];i++) {
-        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
-            num_stream = atoi(argv[++i]);
-        }
-    }
-
-    // initialize and load kernel
-    CUdevice device;
-    CUcontext context;
-    CUmodule module;
-    CUfunction function;
-    CUstream stream[num_stream];
-
-    CUDA_CALL(cuInit(0));
-    CUDA_CALL(cuDeviceGet(&device, 0));
-    CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
-    CUDA_CALL(cuModuleLoad(&module, "multiply.ptx"));
-    CUDA_CALL(cuModuleGetFunction(&function, module, "multiply"));
-    for (int i=0;i<num_stream;i++)
-        CUDA_CALL(cuStreamCreate(&stream[i],0));
-
-    // memory allocate
-    CUdeviceptr devA;
-    CUdeviceptr devB[num_exec];
-    CUdeviceptr devOut[num_exec];
-
-    CUDA_CALL(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
-    for (int i=0;i<num_exec;i++) {
-        CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float)));
-        CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
-    }
-
-    // input buffer
-    float* A = new float[LENGTH*THREAD];
-    float* B = new float[num_exec];
-
-    for (int i=0; i<LENGTH*THREAD; i++)
-        A[i] = (float)(i+1000);
-
-    // output buffer
-    float** result = new float* [num_exec];
-
-    for (int i=0;i<num_exec;i++)
-        result[i] = new float[LENGTH*THREAD];
-
-    // Synchronous data transfer(host to device)
-    CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
-    
-    // Asynchronous data transfer(host to device)
-    int cur = 0;
-
-     for (int i=0;i<num_exec;i++,cur++) {
-         if (num_stream <= cur)
-             cur = 0;
-         B[i] = (float)(i+1);
-         CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
-     }
-
-    cur = 0;
-
-    // Asynchronous launch kernel
-    for (int i=0;i<num_exec;i++,cur++) {
-        if (num_stream <= cur)
-            cur=0;
-        B[i] = (float)(i+1);
-        //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
-        void* args[] = {&devA, &devB[i], &devOut[i]};
-        CUDA_CALL(cuLaunchKernel(function,
-                       LENGTH, 1, 1,
-                       THREAD, 1, 1,
-                                 0, stream[cur], args, NULL));
-        //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
-    }
-
-    cur = 0;
-
-    
-    // Asynchronous data transfer(device to host)
-     for (int i=0;i<num_exec;i++,cur++) {
-         if (num_stream <= cur)
-             cur = 0;
-         CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
-     }
-    
-    // wait for stream
-    for (int i=0;i<num_stream;i++)
-        CUDA_CALL(cuStreamSynchronize(stream[i]));
-    
-    //printf("%0.6f\n",getTime()-start);
-
-    for (int i=0;i<num_exec;i++)
-        check_data(A,(float)(i+1),result[i]);
-
-    // memory release
-    CUDA_CALL(cuMemFree(devA));
-    for (int i=0;i<num_exec;i++) {
-        CUDA_CALL(cuMemFree(devB[i]));
-        CUDA_CALL(cuMemFree(devOut[i]));
-    }
-    for (int i=0;i<num_stream;i++)
-        CUDA_CALL(cuStreamDestroy(stream[i]));
-    CUDA_CALL(cuModuleUnload(module));
-    CUDA_CALL(cuCtxDestroy(context));
-
-    delete[] A;
-    delete[] B;
-    for (int i=0;i<num_exec;i++)
-        delete[] result[i];
-    delete[] result;
-
-    return 0;
-}
-//
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/twice.cu	Thu Feb 09 19:51:32 2017 +0900
@@ -0,0 +1,154 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <cuda.h>
+
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+
+#define LENGTH (10)
+#define THREAD (10)
+
+// Current wall-clock time in seconds, as reported by gettimeofday().
+double getTime() {
+    struct timeval now;
+    gettimeofday(&now, NULL);
+    return now.tv_sec + 1e-6 * (double)now.tv_usec;
+}
+
+// Compares C[k] with A[k]*B for all LENGTH*THREAD elements and prints
+// "multiply failure." at the first mismatch, otherwise "success.".
+void check_data(float* A, float B, float* C) {
+    const int count = LENGTH*THREAD;
+    int k = 0;
+    while (k < count && !(A[k]*B != C[k]))
+        k++;
+    // k stops short of count only when some element differed.
+    puts(k < count ? "multiply failure." : "success.");
+}
+
+// Prints the LENGTH*THREAD floats of C to stdout, one "%f" value per line.
+void print_result(float* C) {
+    for (float* p = C; p != C + LENGTH*THREAD; ++p)
+        printf("%f\n", *p);
+}
+
+// Driver-API test: loads the "multiply" kernel from multiply.ptx, launches it
+// num_exec times round-robin over num_stream streams ("--stream N" / "-s N",
+// default 1) and checks every result on the host with check_data().
+// Returns 0 on completion, 1 when the stream count is missing or below 1.
+int main(int args, char* argv[]) {
+    int num_stream = 1; // number of stream
+    int num_exec = 16; // number of executed kernel
+
+    for (int i=1;i<args;i++) {
+        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
+            // The option needs a value: never read past the end of argv.
+            if (i+1 >= args) {
+                fprintf(stderr, "%s requires a stream count\n", argv[i]);
+                return 1;
+            }
+            num_stream = atoi(argv[++i]);
+        }
+    }
+    // stream[] is indexed from 0 below, so at least one stream must exist.
+    if (num_stream < 1) {
+        fprintf(stderr, "stream count must be 1 or more\n");
+        return 1;
+    }
+
+    // initialize and load kernel
+    CUdevice device;
+    CUcontext context;
+    CUmodule module;
+    CUfunction function;
+    // Heap arrays replace variable-length arrays, which are not standard C++.
+    CUstream* stream = new CUstream[num_stream];
+
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGet(&device, 0));
+    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
+    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamCreate(&stream[i],0));
+
+    // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr* devB = new CUdeviceptr[num_exec];
+    CUdeviceptr* devOut = new CUdeviceptr[num_exec];
+
+    checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
+        checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
+    }
+
+    // input buffer
+    float* A = new float[LENGTH*THREAD];
+    float* B = new float[num_exec];
+
+    for (int i=0; i<LENGTH*THREAD; i++)
+        A[i] = (float)(i+1000);
+
+    // output buffer
+    float** result = new float* [num_exec];
+
+    for (int i=0;i<num_exec;i++)
+        result[i] = new float[LENGTH*THREAD];
+
+    // Synchronous data transfer(host to device)
+    checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
+
+    // Asynchronous data transfer(host to device). Operation i always uses
+    // stream i % num_stream, the same round-robin choice in all three loops.
+    for (int i=0;i<num_exec;i++) {
+        B[i] = (float)(i+1);
+        checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[i%num_stream]));
+    }
+
+    // Asynchronous launch kernel: grid of LENGTH blocks, THREAD threads each
+    for (int i=0;i<num_exec;i++) {
+        void* kernel_args[] = {&devA, &devB[i], &devOut[i]};
+        checkCudaErrors(cuLaunchKernel(function,
+                       LENGTH, 1, 1,
+                       THREAD, 1, 1,
+                       0, stream[i%num_stream], kernel_args, NULL));
+    }
+
+    // Asynchronous data transfer(device to host)
+    for (int i=0;i<num_exec;i++)
+        checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[i%num_stream]));
+
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamSynchronize(stream[i]));
+
+    for (int i=0;i<num_exec;i++)
+        check_data(A,(float)(i+1),result[i]);
+
+    // memory release
+    checkCudaErrors(cuMemFree(devA));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemFree(devB[i]));
+        checkCudaErrors(cuMemFree(devOut[i]));
+    }
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamDestroy(stream[i]));
+    checkCudaErrors(cuModuleUnload(module));
+    checkCudaErrors(cuCtxDestroy(context));
+
+    delete[] A;
+    delete[] B;
+    for (int i=0;i<num_exec;i++)
+        delete[] result[i];
+    delete[] result;
+    delete[] devOut;
+    delete[] devB;
+    delete[] stream;
+
+    return 0;
+}
+//