Mercurial > hg > Gears > GearsAgda
changeset 292:2bc63a22dd21
add twice
author | ikkun |
---|---|
date | Thu, 09 Feb 2017 19:51:32 +0900 |
parents | 87128b876c63 |
children | 198affea1be1 |
files | src/test/CMakeLists.txt src/test/main.cc src/test/twice.cu |
diffstat | 3 files changed, 174 insertions(+), 174 deletions(-) [+] |
line wrap: on
line diff
--- a/src/test/CMakeLists.txt Thu Feb 09 19:02:15 2017 +0900 +++ b/src/test/CMakeLists.txt Thu Feb 09 19:51:32 2017 +0900 @@ -2,13 +2,28 @@ # add_definitions("-Wall -g -O0") -# set(CMAKE_C_COMPILER $ENV{CbC_Clang}/clang) +set(CMAKE_C_COMPILER $ENV{CBC_COMPILER}) +set(CUDA_LINK_FLAGS "-Wl,-search_paths_first -Wl,-headerpad_max_install_names /Developer/NVIDIA/CUDA-8.0/lib/libcudart_static.a -Wl,-rpath,/usr/local/cuda/lib") +SET( CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${CUDA_LINK_FLAGS}" ) cmake_minimum_required(VERSION 2.8) find_package(CUDA REQUIRED) -add_custom_command(OUTPUT test.o - DEPENDS test.c - COMMAND cbclang -c test.c +add_custom_command(OUTPUT main.o + DEPENDS main.cu + COMMAND nvcc -O -c main.cu ) -cuda_add_executable(cudaExmple main.cu test.o) + +add_executable(cudaExmple main.o test.c) + +add_custom_command(OUTPUT twice.o + DEPENDS twice.cu + COMMAND nvcc -O -c twice.cu +) + +add_custom_command(OUTPUT multiply.o + DEPENDS multiply.cu + COMMAND nvcc -O -c multiply.cu +) + +add_executable(twiceExmple twice.o multiply.o test.c)
--- a/src/test/main.cc Thu Feb 09 19:02:15 2017 +0900 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,169 +0,0 @@ -#include <stdio.h> -#include <sys/time.h> -#include <string.h> -#include <stdlib.h> - -extern "C" -{ -//#include <cuda.h> -} -#include <cuda_runtime.h> - -#define LENGTH (10) -#define THREAD (10) - -void -report_error(cudaError_t err, const char* file, int lineNo) { - fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", cudaGetErrorString(err), err, file, lineNo); -} - -#define CUDA_CALL(func) \ - do { \ - if ((func) != CUDA_SUCCESS) { \ - cudaError_t err = cudaGetLastError(); \ - report_error(err, __FILE__, __LINE__); \ - exit(err); \ - } \ - } while(0) - -double -getTime() { - struct timeval tv; - gettimeofday(&tv, NULL); - return tv.tv_sec + (double)tv.tv_usec*1e-6; -} - -void -check_data(float* A, float B, float* C) { - for (int i=0; i<LENGTH*THREAD; i++) { - if (A[i]*B!=C[i]) { - puts("multiply failure."); - return; - } - } - puts("success."); -} - -void print_result(float* C) { - for (int i=0; i<LENGTH*THREAD; i++) { - printf("%f\n",C[i]); - } -} - -int main(int args, char* argv[]) { - int num_stream = 1; // number of stream - int num_exec = 16; // number of executed kernel - - for (int i=1;argv[i];i++) { - if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) { - num_stream = atoi(argv[++i]); - } - } - - // initialize and load kernel - CUdevice device; - CUcontext context; - CUmodule module; - CUfunction function; - CUstream stream[num_stream]; - - CUDA_CALL(cuInit(0)); - CUDA_CALL(cuDeviceGet(&device, 0)); - CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device)); - CUDA_CALL(cuModuleLoad(&module, "multiply.ptx")); - CUDA_CALL(cuModuleGetFunction(&function, module, "multiply")); - for (int i=0;i<num_stream;i++) - CUDA_CALL(cuStreamCreate(&stream[i],0)); - - // memory allocate - CUdeviceptr devA; - CUdeviceptr devB[num_exec]; - CUdeviceptr devOut[num_exec]; - - CUDA_CALL(cuMemAlloc(&devA, 
LENGTH*THREAD*sizeof(float))); - for (int i=0;i<num_exec;i++) { - CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float))); - CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float))); - } - - // input buffer - float* A = new float[LENGTH*THREAD]; - float* B = new float[num_exec]; - - for (int i=0; i<LENGTH*THREAD; i++) - A[i] = (float)(i+1000); - - // output buffer - float** result = new float* [num_exec]; - - for (int i=0;i<num_exec;i++) - result[i] = new float[LENGTH*THREAD]; - - // Synchronous data transfer(host to device) - CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float))); - - // Asynchronous data transfer(host to device) - int cur = 0; - - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur = 0; - B[i] = (float)(i+1); - CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur])); - } - - cur = 0; - - // Asynchronous launch kernel - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur=0; - B[i] = (float)(i+1); - //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]); - void* args[] = {&devA, &devB[i], &devOut[i]}; - CUDA_CALL(cuLaunchKernel(function, - LENGTH, 1, 1, - THREAD, 1, 1, - 0, stream[cur], args, NULL)); - //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]); - } - - cur = 0; - - - // Asynchronous data transfer(device to host) - for (int i=0;i<num_exec;i++,cur++) { - if (num_stream <= cur) - cur = 0; - CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur])); - } - - // wait for stream - for (int i=0;i<num_stream;i++) - CUDA_CALL(cuStreamSynchronize(stream[i])); - - //printf("%0.6f\n",getTime()-start); - - for (int i=0;i<num_exec;i++) - check_data(A,(float)(i+1),result[i]); - - // memory release - CUDA_CALL(cuMemFree(devA)); - for (int i=0;i<num_exec;i++) { - CUDA_CALL(cuMemFree(devB[i])); - CUDA_CALL(cuMemFree(devOut[i])); - } - for (int i=0;i<num_stream;i++) - CUDA_CALL(cuStreamDestroy(stream[i])); - 
CUDA_CALL(cuModuleUnload(module)); - CUDA_CALL(cuCtxDestroy(context)); - - delete[] A; - delete[] B; - for (int i=0;i<num_exec;i++) - delete[] result[i]; - delete[] result; - - return 0; -} -//
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/test/twice.cu Thu Feb 09 19:51:32 2017 +0900
@@ -0,0 +1,154 @@
+// Driver-API stream test: runs the "multiply" kernel num_exec times, spreading
+// copies and launches round-robin over num_stream streams, and checks A*B.
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <vector>
+
+#include <cuda.h> // NOTE(review): presumably must precede helper_cuda.h - confirm
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+
+#define LENGTH (10) // gridDimX of every launch
+#define THREAD (10) // blockDimX of every launch
+
+// Wall-clock time in seconds, taken from gettimeofday().
+double
+getTime() {
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec + (double)tv.tv_usec*1e-6;
+}
+
+// Checks C[i] == A[i]*B for LENGTH*THREAD elements and prints the verdict.
+// Returns true only when every element matches exactly.
+bool
+check_data(const float* A, float B, const float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        if (A[i]*B!=C[i]) {
+            puts("multiply failure.");
+            return false;
+        }
+    }
+    puts("success.");
+    return true;
+}
+
+// Prints LENGTH*THREAD elements of C, one per line.
+void print_result(const float* C) {
+    for (int i=0; i<LENGTH*THREAD; i++) {
+        printf("%f\n",C[i]);
+    }
+}
+
+// Returns the stream count in text, or 0 if it is NULL or not a positive int.
+static int parse_stream_count(const char* text) {
+    if (text == NULL)
+        return 0;
+    char* end = NULL;
+    long value = strtol(text, &end, 10);
+    if (end == text || *end != '\0' || value <= 0 || value > INT_MAX)
+        return 0;
+    return (int)value;
+}
+
+// Usage: [-s|--stream <count>]; exits non-zero on a bad option or wrong result.
+int main(int argc, char* argv[]) {
+    int num_stream = 1; // number of stream
+    const int num_exec = 16; // number of executed kernel
+    const int count = LENGTH*THREAD; // elements per vector
+    const size_t bytes = count*sizeof(float);
+
+    for (int i=1; i<argc; i++) {
+        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
+            // argv[argc] is NULL, so an option without a value is rejected here
+            num_stream = parse_stream_count(argv[++i]);
+            if (num_stream == 0) {
+                fprintf(stderr, "usage: %s [-s|--stream <positive count>]\n", argv[0]);
+                return EXIT_FAILURE;
+            }
+        }
+    }
+
+    // initialize and load kernel
+    CUdevice device;
+    CUcontext context;
+    CUmodule module;
+    CUfunction function;
+    std::vector<CUstream> stream(num_stream);
+
+    checkCudaErrors(cuInit(0));
+    checkCudaErrors(cuDeviceGet(&device, 0));
+    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
+    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
+    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamCreate(&stream[i],0));
+
+    // memory allocate
+    CUdeviceptr devA;
+    std::vector<CUdeviceptr> devB(num_exec);
+    std::vector<CUdeviceptr> devOut(num_exec);
+
+    checkCudaErrors(cuMemAlloc(&devA, bytes));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
+        checkCudaErrors(cuMemAlloc(&devOut[i], bytes));
+    }
+
+    // input buffer: A is shared by every launch, B[i] is the factor of launch i
+    // NOTE(review): pageable memory; presumably must be pinned to overlap - confirm
+    std::vector<float> A(count);
+    std::vector<float> B(num_exec);
+
+    for (int i=0; i<count; i++)
+        A[i] = (float)(i+1000);
+    for (int i=0;i<num_exec;i++)
+        B[i] = (float)(i+1);
+
+    // output buffer: row i holds the count floats produced by launch i
+    std::vector<float> result((size_t)num_exec*count);
+
+    // Synchronous data transfer(host to device)
+    checkCudaErrors(cuMemcpyHtoD(devA, &A[0], bytes));
+
+    // Asynchronous data transfer(host to device), round-robin over the streams
+    for (int i=0;i<num_exec;i++)
+        checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[i%num_stream]));
+
+    // Asynchronous launch kernel, on the stream that carries the B[i] copy
+    for (int i=0;i<num_exec;i++) {
+        void* params[] = {&devA, &devB[i], &devOut[i]};
+        checkCudaErrors(cuLaunchKernel(function,
+                                       LENGTH, 1, 1,
+                                       THREAD, 1, 1,
+                                       0, stream[i%num_stream], params, NULL));
+    }
+
+    // Asynchronous data transfer(device to host)
+    for (int i=0;i<num_exec;i++)
+        checkCudaErrors(cuMemcpyDtoHAsync(&result[(size_t)i*count], devOut[i], bytes, stream[i%num_stream]));
+
+    // wait for stream
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamSynchronize(stream[i]));
+
+    bool all_ok = true;
+    for (int i=0;i<num_exec;i++)
+        all_ok = check_data(&A[0], B[i], &result[(size_t)i*count]) && all_ok;
+
+    // memory release
+    checkCudaErrors(cuMemFree(devA));
+    for (int i=0;i<num_exec;i++) {
+        checkCudaErrors(cuMemFree(devB[i]));
+        checkCudaErrors(cuMemFree(devOut[i]));
+    }
+    for (int i=0;i<num_stream;i++)
+        checkCudaErrors(cuStreamDestroy(stream[i]));
+    checkCudaErrors(cuModuleUnload(module));
+    checkCudaErrors(cuCtxDestroy(context));
+
+    return all_ok ? EXIT_SUCCESS : EXIT_FAILURE;
+}