view src/test/main.cc @ 291:87128b876c63

add test
author ikkun
date Thu, 09 Feb 2017 19:02:15 +0900
parents 625a19d81ed7
children
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>

extern "C"
{
//#include <cuda.h>
}
#include <cuda.h>
#include <cuda_runtime.h>

#define LENGTH (10)
#define THREAD (10)

/*
 * Write a one-line diagnostic for a CUDA runtime error to stderr.
 *
 *   err    - runtime error code; its text comes from cudaGetErrorString()
 *   file   - source file name to mention in the message
 *   lineNo - source line number to mention in the message
 *
 * Only reports; it does not terminate the program.
 */
void
report_error(cudaError_t err, const char* file, int lineNo) {
    const char* description = cudaGetErrorString(err);
    const int code = static_cast<int>(err);

    fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", description, code, file, lineNo);
}

/*
 * Evaluate a CUDA driver API call exactly once and terminate the program
 * if it does not return CUDA_SUCCESS.
 *
 * The CUresult returned by the call itself is what gets reported:
 * cudaGetLastError() belongs to the runtime API and does not track the
 * status of driver (cu*) calls, so it cannot be used to describe them.
 *
 * On failure a message with the error text, numeric code, file and line is
 * written to stderr and the process exits with EXIT_FAILURE. The raw code
 * is not used as the exit status because exit statuses are truncated to
 * 8 bits on POSIX, so some codes would be indistinguishable from success.
 */
#define CUDA_CALL(func) \
    do { \
        CUresult cuda_call_res_ = (func); \
        if (cuda_call_res_ != CUDA_SUCCESS) { \
            const char* cuda_call_msg_ = NULL; \
            if (cuGetErrorString(cuda_call_res_, &cuda_call_msg_) != CUDA_SUCCESS \
                || cuda_call_msg_ == NULL) \
                cuda_call_msg_ = "unknown error"; \
            fprintf(stderr, "[cudaError] %s (error code: %d) at %s line %d\n", \
                    cuda_call_msg_, (int)cuda_call_res_, __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)

/*
 * Return the current wall-clock time in seconds as a double, as reported
 * by gettimeofday(); the microsecond field supplies the fractional part.
 */
double
getTime() {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_seconds = now.tv_sec;
    const double fraction = (double)now.tv_usec * 1e-6;
    return whole_seconds + fraction;
}

/*
 * Check that C holds A scaled by B.
 *
 * Walks the first LENGTH*THREAD elements and compares A[i]*B with C[i]
 * using exact float equality. Prints "multiply failure." at the first
 * mismatch and stops; prints "success." when every element matches.
 */
void
check_data(float* A, float B, float* C) {
    const int count = LENGTH*THREAD;
    int idx = 0;

    // Advance while elements match; stopping early means a mismatch.
    while (idx < count && A[idx]*B == C[idx])
        ++idx;

    puts(idx < count ? "multiply failure." : "success.");
}

/*
 * Print the first LENGTH*THREAD floats of C to stdout, one per line,
 * using the "%f" format.
 */
void print_result(float* C) {
    const float* const stop = C + LENGTH*THREAD;

    for (const float* cursor = C; cursor != stop; ++cursor)
        printf("%f\n", *cursor);
}

int main(int args, char* argv[]) {
    int num_stream = 1; // number of stream
    int num_exec = 16; // number of executed kernel
    
    for (int i=1;argv[i];i++) {
        if (strcmp(argv[i], "--stream") == 0 || strcmp(argv[i], "-s") == 0) {
            num_stream = atoi(argv[++i]);
        }
    }

    // initialize and load kernel
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction function;
    CUstream stream[num_stream];

    CUDA_CALL(cuInit(0));
    CUDA_CALL(cuDeviceGet(&device, 0));
    CUDA_CALL(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
    CUDA_CALL(cuModuleLoad(&module, "multiply.ptx"));
    CUDA_CALL(cuModuleGetFunction(&function, module, "multiply"));
    for (int i=0;i<num_stream;i++)
        CUDA_CALL(cuStreamCreate(&stream[i],0));

    // memory allocate
    CUdeviceptr devA;
    CUdeviceptr devB[num_exec];
    CUdeviceptr devOut[num_exec];

    CUDA_CALL(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
    for (int i=0;i<num_exec;i++) {
        CUDA_CALL(cuMemAlloc(&devB[i], sizeof(float)));
        CUDA_CALL(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
    }

    // input buffer
    float* A = new float[LENGTH*THREAD];
    float* B = new float[num_exec];

    for (int i=0; i<LENGTH*THREAD; i++)
        A[i] = (float)(i+1000);

    // output buffer
    float** result = new float* [num_exec];

    for (int i=0;i<num_exec;i++)
        result[i] = new float[LENGTH*THREAD];

    // Synchronous data transfer(host to device)
    CUDA_CALL(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));
    
    // Asynchronous data transfer(host to device)
    int cur = 0;

     for (int i=0;i<num_exec;i++,cur++) {
         if (num_stream <= cur)
             cur = 0;
         B[i] = (float)(i+1);
         CUDA_CALL(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]));
     }

    cur = 0;

    // Asynchronous launch kernel
    for (int i=0;i<num_exec;i++,cur++) {
        if (num_stream <= cur)
            cur=0;
        B[i] = (float)(i+1);
        //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
        void* args[] = {&devA, &devB[i], &devOut[i]};
        CUDA_CALL(cuLaunchKernel(function,
                       LENGTH, 1, 1,
                       THREAD, 1, 1,
                                 0, stream[cur], args, NULL));
        //cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]);
    }

    cur = 0;

    
    // Asynchronous data transfer(device to host)
     for (int i=0;i<num_exec;i++,cur++) {
         if (num_stream <= cur)
             cur = 0;
         CUDA_CALL(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[cur]));
     }
    
    // wait for stream
    for (int i=0;i<num_stream;i++)
        CUDA_CALL(cuStreamSynchronize(stream[i]));
    
    //printf("%0.6f\n",getTime()-start);

    for (int i=0;i<num_exec;i++)
        check_data(A,(float)(i+1),result[i]);

    // memory release
    CUDA_CALL(cuMemFree(devA));
    for (int i=0;i<num_exec;i++) {
        CUDA_CALL(cuMemFree(devB[i]));
        CUDA_CALL(cuMemFree(devOut[i]));
    }
    for (int i=0;i<num_stream;i++)
        CUDA_CALL(cuStreamDestroy(stream[i]));
    CUDA_CALL(cuModuleUnload(module));
    CUDA_CALL(cuCtxDestroy(context));

    delete[] A;
    delete[] B;
    for (int i=0;i<num_exec;i++)
        delete[] result[i];
    delete[] result;

    return 0;
}
//