Mercurial > hg > GearsTemplate
changeset 298:898fce27f334
use cbccompiler
author | ikkun |
---|---|
date | Sat, 11 Feb 2017 19:12:09 +0900 |
parents | b46398081fe4 |
children | b387b224790c |
files | src/test/CMakeLists.txt src/test/twice.cc src/test/twice.cu |
diffstat | 3 files changed, 156 insertions(+), 161 deletions(-) [+] |
line wrap: on
line diff
--- a/src/test/CMakeLists.txt Sat Feb 11 10:55:36 2017 +0900 +++ b/src/test/CMakeLists.txt Sat Feb 11 19:12:09 2017 +0900 @@ -6,7 +6,7 @@ include_directories("/usr/local/cuda/include") -# set(CMAKE_C_COMPILER $ENV{CBC_COMPILER}) +set(CMAKE_C_COMPILER $ENV{CBC_COMPILER}) set(CUDA_LINK_FLAGS "-framework CUDA -lc++ -Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") # for linux use -lcuda @@ -23,17 +23,12 @@ add_executable(cudaExmple main.o test.c) -add_custom_command(OUTPUT twice.o - DEPENDS twice.cu - COMMAND nvcc ${NVCCFLAG} -c twice.cu -) - add_custom_command(OUTPUT multiply.ptx DEPENDS multiply.cu COMMAND nvcc ${NVCCFLAG} -c multiply.cu -ptx ) -add_executable(twiceExample twice.o multiply.ptx test.c) +add_executable(twiceExample twice.cc multiply.ptx test.c) add_custom_command(OUTPUT vectorAdd_kernel.ptx DEPENDS vectorAdd_kernel.cu
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/twice.cc Sat Feb 11 19:12:09 2017 +0900
@@ -0,0 +1,171 @@
+/*
+ * twice.cc - CUDA driver API sample.
+ *
+ * Loads the "multiply" function from multiply.ptx, launches it num_exec
+ * times spread round-robin over num_stream streams, copies every result
+ * back and compares it with A[i]*B computed on the host.
+ *
+ * usage: twiceExample [-s|--stream N]   (N >= 1, default 1)
+ */
+#include <limits.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <vector>
+
+#include <cuda.h>
+
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+
+#define LENGTH (10) // grid width (blocks) given to cuLaunchKernel
+#define THREAD (10) // block width (threads) given to cuLaunchKernel
+
+// Wall-clock time in seconds (microsecond resolution) from gettimeofday().
+double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+// Compare C[i] with A[i]*B (exact float comparison) for all LENGTH*THREAD
+// elements. Prints "multiply failure." and returns false at the first
+// mismatch; prints "success." and returns true otherwise.
+bool
+check_data(float* A, float B, float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        if (A[i]*B!=C[i]) {
+            puts("multiply failure.");
+            return false;
+        }
+    }
+    puts("success.");
+    return true;
+}
+
+// Print the LENGTH*THREAD elements of C, one per line (debugging aid).
+void print_result(float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        printf("%f\n",C[i]);
+    }
+}
+
+// Return the stream count requested with "-s N" / "--stream N", or 1 when
+// the option is absent. Exits with a message on stderr when the value is
+// missing or is not an integer >= 1, so that main never sizes its stream
+// array with zero, a negative number or garbage.
+static int
+parse_num_stream(int argc, char* argv[]) {
+    int num_stream = 1;
+
+    for (int i=1; i<argc; i++) {
+        if (strcmp(argv[i], "--stream") != 0 && strcmp(argv[i], "-s") != 0)
+            continue;
+        if (i+1 >= argc) {
+            fprintf(stderr, "%s: option %s requires a value\n", argv[0], argv[i]);
+            exit(EXIT_FAILURE);
+        }
+        char* end = NULL;
+        long value = strtol(argv[++i], &end, 10);
+        if (end == argv[i] || *end != '\0' || value < 1 || value > INT_MAX) {
+            fprintf(stderr, "%s: invalid number of streams: %s\n", argv[0], argv[i]);
+            exit(EXIT_FAILURE);
+        }
+        num_stream = (int)value;
+    }
+    return num_stream;
+}
+
+int main(int argc, char* argv[]) {
+    const int num_stream = parse_num_stream(argc, argv); // number of stream
+    const int num_exec = 16;                              // number of executed kernel
+    const size_t bytes = LENGTH*THREAD*sizeof(float);     // size of one data buffer
+
+    // initialize and load kernel
+    CUdevice device;
+    CUcontext context;
+    CUmodule module;
+    CUfunction function;
+    std::vector<CUstream> stream(num_stream);
+
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGet(&device, 0));
+    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
+    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamCreate(&stream[i],0));
+
+    // memory allocate
+    CUdeviceptr devA;
+    std::vector<CUdeviceptr> devB(num_exec);
+    std::vector<CUdeviceptr> devOut(num_exec);
+
+    checkCudaErrors(cuMemAlloc(&devA, bytes));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
+        checkCudaErrors(cuMemAlloc(&devOut[i], bytes));
+    }
+
+    // input buffer (B must stay alive until the streams are synchronized,
+    // because the asynchronous copies below read from it)
+    std::vector<float> A(LENGTH*THREAD);
+    std::vector<float> B(num_exec);
+
+    for (int i=0; i<LENGTH*THREAD; i++)
+        A[i] = (float)(i+1000);
+    for (int i=0;i<num_exec;i++)
+        B[i] = (float)(i+1);
+
+    // output buffer
+    std::vector< std::vector<float> > result(num_exec, std::vector<float>(LENGTH*THREAD));
+
+    // Synchronous data transfer(host to device)
+    checkCudaErrors(cuMemcpyHtoD(devA, &A[0], bytes));
+
+    // Asynchronous data transfer(host to device). Execution i uses stream
+    // i % num_stream in all three phases below, so its copy-in, launch and
+    // copy-back are issued on the same stream.
+    for (int i=0;i<num_exec;i++)
+        checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[i % num_stream]));
+
+    // Asynchronous launch kernel
+    for (int i=0;i<num_exec;i++) {
+        void* kernel_args[] = {&devA, &devB[i], &devOut[i]};
+        checkCudaErrors(cuLaunchKernel(function,
+                                       LENGTH, 1, 1,
+                                       THREAD, 1, 1,
+                                       0, stream[i % num_stream], kernel_args, NULL));
+    }
+
+    // Asynchronous data transfer(device to host)
+    for (int i=0;i<num_exec;i++)
+        checkCudaErrors(cuMemcpyDtoHAsync(&result[i][0], devOut[i], bytes, stream[i % num_stream]));
+
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamSynchronize(stream[i]));
+
+    //printf("%0.6f\n",getTime()-start);
+
+    // verify every execution; remember failures for the exit status
+    bool ok = true;
+    for (int i=0;i<num_exec;i++)
+        ok = check_data(&A[0], B[i], &result[i][0]) && ok;
+
+    // memory release
+    checkCudaErrors(cuMemFree(devA));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemFree(devB[i]));
+        checkCudaErrors(cuMemFree(devOut[i]));
+    }
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamDestroy(stream[i]));
+    checkCudaErrors(cuModuleUnload(module));
+    checkCudaErrors(cuCtxDestroy(context));
+
+    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}
--- a/src/test/twice.cu Sat Feb 11 10:55:36 2017 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,154 +0,0 @@ -#include <stdio.h> -#include <sys/time.h> -#include <string.h> -#include <stdlib.h> - -#include <cuda.h> - -#include <cuda_runtime.h> -#include "helper_cuda.h" - -#define LENGTH (10) -#define THREAD (10) - -double -getTime() { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec + (double)tv.tv_usec*1e-6; -} - -void -check_data(float* A, float B, float* C) { - for (int i=0; i<LENGTH*THREAD; i++) { - if (A[i]*B!=C[i]) { - puts("multiply failure."); - return; - } - } - puts("success."); -} - -void print_result(float* C) { - for (int i=0; i<LENGTH*THREAD; i++) { - printf("%f\n",C[i]); - } -} - -int main(int args, char* argv[]) { - int num_stream = 1; // number of stream - int num_exec = 16; // number of executed kernel - - for (int i=1;argv[i];i++) { - if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) { - num_stream = atoi(argv[++i]); - } - } - - // initialize and load kernel - CUdevice device; - CUcontext context; - CUmodule module; - CUfunction function; - CUstream stream[num_stream]; - - checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGet(&device, 0)); - checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device)); - checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); - checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamCreate(&stream[i],0)); - - // memory allocate - CUdeviceptr devA; - CUdeviceptr devB[num_exec]; - CUdeviceptr devOut[num_exec]; - - checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); - for (int i=0;i<num_exec;i++) { - checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); - checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); - } - - // input buffer - float* A = new float[LENGTH*THREAD]; - float* B = new float[num_exec]; - - for (int i=0; i<LENGTH*THREAD; i++) - A[i] = 
(float)(i+1000); - - // output buffer - float** result = new float* [num_exec]; - - for (int i=0;i<num_exec;i++) - result[i] = new float[LENGTH*THREAD]; - - // Synchronous data transfer(host to device) - checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); - - // Asynchronous data transfer(host to device) - int cur = 0; - - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur = 0; - B[i] = (float)(i+1); - checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur])); - } - - cur = 0; - - // Asynchronous launch kernel - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur=0; - B[i] = (float)(i+1); - //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); - void* args[] = {&devA, &devB[i], &devOut[i]}; - checkCudaErrors(cuLaunchKernel(function, - LENGTH, 1, 1, - THREAD, 1, 1, - 0, stream[cur], args, NULL)); - //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); - } - - cur = 0; - - - // Asynchronous data transfer(device to host) - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur = 0; - checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); - } - - // wait for stream - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamSynchronize(stream[i])); - - //printf("%0.6f\n",getTime()-start); - - for (int i=0;i<num_exec;i++) - check_data(A,(float)(i+1),result[i]); - - // memory release - checkCudaErrors(cuMemFree(devA)); - for (int i=0;i<num_exec;i++) { - checkCudaErrors(cuMemFree(devB[i])); - checkCudaErrors(cuMemFree(devOut[i])); - } - for (int i=0;i<num_stream;i++) - checkCudaErrors(cuStreamDestroy(stream[i])); - checkCudaErrors(cuModuleUnload(module)); - checkCudaErrors(cuCtxDestroy(context)); - - delete[] A; - delete[] B; - for (int i=0;i<num_exec;i++) - delete[] result[i]; - delete[] result; - - return 0; -} -//