Mercurial > hg > Gears > GearsAgda
view src/test/twice.cc @ 301:609bf62768b9
add -DUSE_CUDA=1 flag to cmake
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Sun, 12 Feb 2017 12:35:11 +0900 |
parents | 8bbc0012e1a4 |
children | 1839586f5b41 |
line wrap: on
line source
#include <stdio.h> #include <sys/time.h> #include <string.h> #include <stdlib.h> #include <cuda.h> #include <cuda_runtime.h> #include "helper_cuda.h" #define LENGTH (10) #define THREAD (10) double getTime() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + (double)tv.tv_usec*1e-6; } void check_data(float* A, float B, float* C) { for (int i=0; i<LENGTH*THREAD; i++) { if (A[i]*B!=C[i]) { puts("multiply failure."); return; } } puts("success."); } void print_result(float* C) { for (int i=0; i<LENGTH*THREAD; i++) { printf("%f\n",C[i]); } } int main(int args, char* argv[]) { int num_stream = 1; // number of stream int num_exec = 16; // number of executed kernel for (int i=1;argv[i];i++) { if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) { num_stream = atoi(argv[++i]); } if (strcmp(argv[i], "--numExec") == 0 || strcmp(argv[i], "-e") == 0) { num_exec = atoi(argv[++i]); } } // initialize and load kernel CUdevice device; CUcontext context; CUmodule module; CUfunction function; CUstream stream[num_stream]; checkCudaErrors(cuInit(0)); checkCudaErrors(cuDeviceGet(&device, 0)); checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device)); checkCudaErrors(cuModuleLoad(&module, "multiply.ptx")); checkCudaErrors(cuModuleGetFunction(&function, module, "multiply")); if (num_stream) { for (int i=0;i<num_stream;i++) checkCudaErrors(cuStreamCreate(&stream[i],0)); } // memory allocate CUdeviceptr devA; CUdeviceptr devB[num_exec]; CUdeviceptr devOut[num_exec]; checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float))); for (int i=0;i<num_exec;i++) { checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float))); checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); } // input buffer float* A = new float[LENGTH*THREAD]; float* B = new float[num_exec]; for (int i=0; i<LENGTH*THREAD; i++) A[i] = (float)(i+1000); // output buffer float** result = new float* [num_exec]; for (int i=0;i<num_exec;i++) result[i] = new float[LENGTH*THREAD]; // Synchronous data transfer(host to device) checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); // Asynchronous data transfer(host to device) int cur = 0; for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; B[i] = (float)(i+1); if (num_stream) { checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur])); } else { checkCudaErrors(cuMemcpyHtoD(devB[i], &B[i], sizeof(float))); } } cur = 0; // Asynchronous launch kernel for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur=0; //B[i] = (float)(i+1); //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); void* args[] = {&devA, &devB[i], &devOut[i]}; checkCudaErrors(cuLaunchKernel(function, LENGTH, 1, 1, THREAD, 1, 1, 0, num_stream ? stream[cur] : NULL , args, NULL)); //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); } cur = 0; // Asynchronous data transfer(device to host) for (int i=0;i<num_exec;i++,cur++) { if (num_stream <= cur) cur = 0; if (num_stream) { checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); } else { checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float))); } } // wait for stream for (int i=0;i<num_stream;i++) checkCudaErrors(cuStreamSynchronize(stream[i])); //printf("%0.6f\n",getTime()-start); for (int i=0;i<num_exec;i++) check_data(A,(float)(i+1),result[i]); // memory release checkCudaErrors(cuMemFree(devA)); for (int i=0;i<num_exec;i++) { checkCudaErrors(cuMemFree(devB[i])); checkCudaErrors(cuMemFree(devOut[i])); } for (int i=0;i<num_stream;i++) checkCudaErrors(cuStreamDestroy(stream[i])); checkCudaErrors(cuModuleUnload(module)); checkCudaErrors(cuCtxDestroy(context)); delete[] A; delete[] B; for (int i=0;i<num_exec;i++) delete[] result[i]; delete[] result; return 0; }