view src/parallel_execution/CUDAExecutor.cbc @ 435:af0ec811b20e

Add CUDAExecutor
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Sat, 04 Nov 2017 04:14:36 +0900
parents
children 08a93fc2f0d3
line wrap: on
line source

#include "../context.h"
#include <stdio.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"

Executor* createCUDAExecutor(struct Context* context) {
    // Allocate the CUDA backend first, then wrap it in the generic Executor
    // interface, wiring the three CUDA code segments (read / exec / write).
    struct CUDAExecutor* backend = new CUDAExecutor();
    struct Executor* e = new Executor();
    e->executor = (union Data*)backend;
    e->read     = C_readCUDAExecutor;
    e->exec     = C_execCUDAExecutor;
    e->write    = C_writeCUDAExecutor;
    return e;
}

__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
    int paramLen = buffer->inputLen + buffer->outputLen;
    struct CUDABuffer buffer = executor->buffer;
    buffer->kernelParams = ALLOCATE_PTR_ARRAY(context, CudevicePtr, paramLen);
    struct CUDABuffer buffer = executor->buffer;
    CUdeviceptr* deviceptrs = ALLOCATE_ARRAY(context, CudevicePtr, paramLen);
    for (int i = 0; i < paramLen; i++) {
        CUdeviceptr deviceptr = deviceptrs[i];
        // memory allocate
        union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen];
        checkCUDAErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
        checkCUDAErrors(cuMemcpyHtoD(deviceptr, data, GET_SIZE(data)));
        // Synchronous data transfer(host to device)
        buffer->kernelParams[paramCount++] = &deviceptr;
    }
}

void cudaLoadFunction(struct Context* context, char* filename, char* function) {
    checkCUDAErrors(cuModuleLoad(&context->module, filename));
    checkCUDAErrors(cuModuleGetFunction(&context->function, context->module, function));
}

__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
    // Asynchronous launch kernel
    task->num_exec = 1;
    struct CUDABuffer buffer = executor->buffer;
    if (task->iterate) {
        struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
        checkCUDAErrors(cuLaunchKernel(task->function,
                    iterator->x, iterator->y, iterator->z,
                    1, 1, 1,
                    0, NULL, (void**)buffer->kernelParams, NULL));
    } else {
        checkCUDAErrors(cuLaunchKernel(task->function,
                    1, 1, 1,
                    1, 1, 1,
                    0, NULL, (void**)buffer->kernelParams, NULL));
    }
}

__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task) {
    //結果を取ってくるコマンドを入力する
    //コマンドの終了待ちを行う   
    int paramLen = buffer->inputLen + buffer->outputLen;
    struct CUDABuffer buffer = executor->buffer;
    for (int i = 0; i < paramLen; i++) {
        CUdeviceptr* deviceptr =  buffer->kernelParams[i];
        union Data* data = i < inputLen? buffer->inputData[i] : buffer->outputData[i-inputLen];
        checkCUDAErrors(cuMemcpyDtoH(data, *deviceptr, GET_SIZE(data)));
        cuMemFree(*deviceptr);
    }
    // wait for stream
    checkCUDAErrors(cuCtxSynchronize());
}