changeset 290:625a19d81ed7
add Cmake
author | ikkun |
---|---|
date | Wed, 08 Feb 2017 18:25:32 +0900 |
parents | bc17237bc8cf |
children | 87128b876c63 |
files | src/test/CMakeLists.txt src/test/main.cc src/test/multiply.cu |
diffstat | 3 files changed, 186 insertions(+), 0 deletions(-) |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/CMakeLists.txt	Wed Feb 08 18:25:32 2017 +0900
@@ -0,0 +1,12 @@
+cmake_minimum_required(VERSION 2.8)
+
+add_definitions("-Wall -g -O0")
+
+set(CMAKE_C_COMPILER $ENV{CbC_Clang}/clang)
+
+# include_directories(include)
+add_executable(cudaExample
+    main.cc
+    multiply.cu
+)
+
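Note (editorial sketch, not part of this changeset): main.cc below loads its kernel at run time with cuModuleLoad(&module, "multiply.ptx"), but the CMakeLists.txt above only sets the C compiler and passes multiply.cu to add_executable; nothing in it produces multiply.ptx or links the CUDA driver library. One way to cover both, assuming nvcc is on the PATH and the stock FindCUDA module is available, might look like this:

    # Hypothetical additions, not in this commit.
    find_package(CUDA REQUIRED)
    include_directories(${CUDA_INCLUDE_DIRS})

    # Emit the PTX that main.cc loads at run time via cuModuleLoad().
    add_custom_command(
        OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/multiply.ptx
        COMMAND nvcc -ptx ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu
                -o ${CMAKE_CURRENT_BINARY_DIR}/multiply.ptx
        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/multiply.cu)
    add_custom_target(multiply_ptx ALL
        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/multiply.ptx)

    # cuda = driver API (cuInit, cuLaunchKernel, ...); cudart provides cudaGetErrorString().
    target_link_libraries(cudaExample cuda ${CUDA_CUDART_LIBRARY})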
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/main.cc	Wed Feb 08 18:25:32 2017 +0900
@@ -0,0 +1,166 @@
+#include <stdio.h>
+#include <sys/time.h>
+#include <string.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdlib.h>
+
+
+#define LENGTH (10)
+#define THREAD (10)
+
+void
+report_error(cudaError_t err, const char* file, int lineNo) {
+    fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", cudaGetErrorString(err), err, file, lineNo);
+}
+
+#define CUDA_CALL(func) \
+    do { \
+        if ((func) != CUDA_SUCCESS) { \
+            cudaError_t err = cudaGetLastError(); \
+            report_error(err, __FILE__, __LINE__); \
+            exit(err); \
+        } \
+    } while(0)
+
+double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+void
+check_data(float* A, float B, float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        if (A[i]*B!=C[i]) {
+            puts("multiply failure.");
+            return;
+        }
+    }
+    puts("success.");
+}
+
+void print_result(float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        printf("%f\n",C[i]);
+    }
+}
+
+int main(int args, char* argv[]) {
+    int num_stream = 1; // number of stream
+    int num_exec = 16; // number of executed kernel
+
+    for (int i=1;argv[i];i++) {
+        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
+            num_stream = atoi(argv[++i]);
+        }
+    }
+
+    // initialize and load kernel
+    CUdevice device;
+    CUcontext context;
+    CUmodule module;
+    CUfunction function;
+    CUstream stream[num_stream];
+
+    CUDA_CALL(cuInit(0));
+    CUDA_CALL(cuDeviceGet(&device, 0));
+    CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
+    CUDA_CALL(cuModuleLoad(&module, "multiply.ptx"));
+    CUDA_CALL(cuModuleGetFunction(&function, module, "multiply"));
+    for (int i=0;i<num_stream;i++)
+        CUDA_CALL(cuStreamCreate(&stream[i],0));
+
+    // memory allocate
+    CUdeviceptr devA;
+    CUdeviceptr devB[num_exec];
+    CUdeviceptr devOut[num_exec];
+
+    CUDA_CALL(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
+    for (int i=0;i<num_exec;i++) {
+        CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float)));
+        CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
+    }
+
+    // input buffer
+    float* A = new float[LENGTH*THREAD];
+    float* B = new float[num_exec];
+
+    for (int i=0; i<LENGTH*THREAD; i++)
+        A[i] = (float)(i+1000);
+
+    // output buffer
+    float** result = new float* [num_exec];
+
+    for (int i=0;i<num_exec;i++)
+        result[i] = new float[LENGTH*THREAD];
+
+    // Synchronous data transfer(host to device)
+    CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
+
+    // Asynchronous data transfer(host to device)
+    int cur = 0;
+
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        B[i] = (float)(i+1);
+        CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
+    }
+
+    cur = 0;
+
+    // Asynchronous launch kernel
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur=0;
+        B[i] = (float)(i+1);
+        //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
+        void* args[] = {&devA, &devB[i], &devOut[i]};
+        CUDA_CALL(cuLaunchKernel(function,
+                                 LENGTH, 1, 1,
+                                 THREAD, 1, 1,
+                                 0, stream[cur], args, NULL));
+        //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
+    }
+
+    cur = 0;
+
+
+    // Asynchronous data transfer(device to host)
+    for (int i=0;i<num_exec;i++,cur++) {
+        if (num_stream <= cur)
+            cur = 0;
+        CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
+    }
+
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        CUDA_CALL(cuStreamSynchronize(stream[i]));
+
+    //printf("%0.6f\n",getTime()-start);
+
+    for (int i=0;i<num_exec;i++)
+        check_data(A,(float)(i+1),result[i]);
+
+    // memory release
+    CUDA_CALL(cuMemFree(devA));
+    for (int i=0;i<num_exec;i++) {
+        CUDA_CALL(cuMemFree(devB[i]));
+        CUDA_CALL(cuMemFree(devOut[i]));
+    }
+    for (int i=0;i<num_stream;i++)
+        CUDA_CALL(cuStreamDestroy(stream[i]));
+    CUDA_CALL(cuModuleUnload(module));
+    CUDA_CALL(cuCtxDestroy(context));
+
+    delete[] A;
+    delete[] B;
+    for (int i=0;i<num_exec;i++)
+        delete[] result[i];
+    delete[] result;
+
+    return 0;
+}
+//
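Note (editorial, not part of this changeset): the commented-out timing line //printf("%0.6f\n",getTime()-start); refers to a variable start that is never declared, so it cannot simply be re-enabled. A minimal sketch of how the file's own getTime() helper could be used, timing only the asynchronous section, would be:

    double start = getTime();               // before the async copies and launches
    /* ... cuMemcpyHtoDAsync / cuLaunchKernel / cuMemcpyDtoHAsync loops ... */
    for (int i = 0; i < num_stream; i++)
        CUDA_CALL(cuStreamSynchronize(stream[i]));   // make the GPU work observable
    printf("%0.6f\n", getTime() - start);            // elapsed wall-clock seconds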
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/multiply.cu	Wed Feb 08 18:25:32 2017 +0900
@@ -0,0 +1,8 @@
+extern "C" {
+    __global__ void multiply(float* A, float* B, float* C,int* i) {
+//      printf("%d %d\n",i[0],i[1]);
+        int index = blockIdx.x * blockDim.x + threadIdx.x;
+        C[index] = A[index] * B[0];
+    }
+
+}
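Note (editorial, not part of this changeset): the kernel is declared with four parameters, while main.cc builds its launch arguments as void* args[] = {&devA, &devB[i], &devOut[i]} and the commented-out printf is the only use of int* i. cuLaunchKernel reads one kernelParams entry per declared kernel parameter, so the three-entry array makes the launch read past the end of args. If the extra parameter is not needed, a signature that matches the existing host code would be, for example:

    extern "C" {
        // Hypothetical variant matching the three-entry args[] in main.cc.
        __global__ void multiply(float* A, float* B, float* C) {
            int index = blockIdx.x * blockDim.x + threadIdx.x;
            C[index] = A[index] * B[0];
        }
    }

Alternatively, the host side could allocate and pass a fourth device pointer so the committed signature can stay as it is.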