Mercurial > hg > Gears > GearsAgda
view src/test/main.cc @ 291:87128b876c63
add test
author | ikkun |
---|---|
date | Thu, 09 Feb 2017 19:02:15 +0900 |
parents | 625a19d81ed7 |
children |
line wrap: on
line source
#include <stdio.h> #include <sys/time.h> #include <string.h> #include <stdlib.h> extern "C" { //#include <cuda.h> } #include <cuda_runtime.h> #define LENGTH (10) #define THREAD (10) void report_error(cudaError_t err, const char* file, int lineNo) { fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", cudaGetErrorString(err), err, file, lineNo); } #define CUDA_CALL(func) \ do { \ if ((func) != CUDA_SUCCESS) { \ cudaError_t err = cudaGetLastError(); \ report_error(err, __FILE__, __LINE__); \ exit(err); \ } \ } while(0) double getTime() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + (double)tv.tv_usec*1e-6; } void check_data(float* A, float B, float* C) { for (int i=0; i<LENGTH*THREAD; i++) { if (A[i]*B!=C[i]) { puts("multiply failure."); return; } } puts("success."); } void print_result(float* C) { for (int i=0; i<LENGTH*THREAD; i++) { printf("%f\n",C[i]); } } int main(int args, char* argv[]) { int num_stream = 1; // number of stream int num_exec = 16; // number of executed kernel for (int i=1;argv[i];i++) { if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) { num_stream = atoi(argv[++i]); } } // initialize and load kernel CUdevice device; CUcontext context; CUmodule module; CUfunction function; CUstream stream[num_stream]; CUDA_CALL(cuInit(0)); CUDA_CALL(cuDeviceGet(&device, 0)); CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device)); CUDA_CALL(cuModuleLoad(&module, "multiply.ptx")); CUDA_CALL(cuModuleGetFunction(&function, module, "multiply")); for (int i=0;i<num_stream;i++) CUDA_CALL(cuStreamCreate(&stream[i],0)); // memory allocate CUdeviceptr devA; CUdeviceptr devB[num_exec]; CUdeviceptr devOut[num_exec]; CUDA_CALL(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); for (int i=0;i<num_exec;i++) { CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float))); CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); } // input buffer float* A = new float[LENGTH*THREAD]; float* B = new float[num_exec]; for (int i=0; i<LENGTH*THREAD; i++) A[i] = (float)(i+1000); // output buffer float** result = new float* [num_exec]; for (int i=0;i<num_exec;i++) result[i] = new float[LENGTH*THREAD]; // Synchronous data transfer(host to device) CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); // Asynchronous data transfer(host to device) int cur = 0; for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; B[i] = (float)(i+1); CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur])); } cur = 0; // Asynchronous launch kernel for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur=0; B[i] = (float)(i+1); //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); void* args[] = {&devA, &devB[i], &devOut[i]}; CUDA_CALL(cuLaunchKernel(function, LENGTH, 1, 1, THREAD, 1, 1, 0, stream[cur], args, NULL)); //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); } cur = 0; // Asynchronous data transfer(device to host) for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); } // wait for stream for (int i=0;i<num_stream;i++) CUDA_CALL(cuStreamSynchronize(stream[i])); //printf("%0.6f\n",getTime()-start); for (int i=0;i<num_exec;i++) check_data(A,(float)(i+1),result[i]); // memory release CUDA_CALL(cuMemFree(devA)); for (int i=0;i<num_exec;i++) { CUDA_CALL(cuMemFree(devB[i])); CUDA_CALL(cuMemFree(devOut[i])); } for (int i=0;i<num_stream;i++) CUDA_CALL(cuStreamDestroy(stream[i])); CUDA_CALL(cuModuleUnload(module)); CUDA_CALL(cuCtxDestroy(context)); delete[] A; delete[] B; for (int i=0;i<num_exec;i++) delete[] result[i]; delete[] result; return 0; } //