view src/parallel_execution/CUDAExecutor.cbc @ 462:8d7e5d48cad3

Running CPU examples
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Wed, 20 Dec 2017 22:05:08 +0900
parents dcc42f3e7e97
children 7d67c9cf09ee
line wrap: on
line source

#include "../context.h"
#include <stdio.h>

/*
 * Build an Executor backed by the CUDA driver API for the given device.
 * Queries the device's maximum threads-per-block once up front so the
 * exec stage can size kernel launches, and wires the read/exec/write
 * stages to the CUDA code segments below.
 */
Executor* createCUDAExecutor(struct Context* context, CUdevice device) {
    struct CUDAExecutor* cuda = new CUDAExecutor();
    struct Executor* executor = new Executor();
    // Cache the per-block thread limit for later launch-dimension clamping.
    checkCudaErrors(cuDeviceGetAttribute(&cuda->maxThreadPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device));
    executor->read  = C_readCUDAExecutor;
    executor->exec  = C_execCUDAExecutor;
    executor->write = C_writeCUDAExecutor;
    executor->executor = (union Data*)cuda;
    return executor;
}

/*
 * Read stage: allocate device buffers for every kernel parameter and copy
 * the host-side data to the device synchronously.
 * Parameters are ordered inputs first, then outputs; both are copied
 * host-to-device here (outputs too — presumably so the kernel may
 * read-modify-write them; TODO confirm against callers).
 * Continues to execCUDAExecutor via goto meta (pipeline not yet implemented).
 */
__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    struct CUDABuffer* buffer = executor->buffer;
    int paramLen = buffer->inputLen + buffer->outputLen;
    // Array of device-pointer handles, one per kernel parameter.
    executor->kernelParams = (CUdeviceptr**)ALLOCATE_PTR_ARRAY(context, CUdeviceptr, paramLen);
    for (int i = 0; i < paramLen; i++) {
        // NOTE(review): these host-side CUdeviceptr wrappers are never
        // explicitly freed; presumably `new` allocates from the context's
        // arena and is reclaimed with it — confirm.
        CUdeviceptr* deviceptr = new CUdeviceptr();
        // memory allocate
        // i < inputLen indexes inputData; the rest map to outputData.
        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
        checkCudaErrors(cuMemAlloc(deviceptr, GET_SIZE(data)));
        checkCudaErrors(cuMemcpyHtoD(*deviceptr, data, GET_SIZE(data)));
        // Synchronous data transfer(host to device)
        executor->kernelParams[i] = deviceptr;
    }
    // TODO: Implements pipeline
    // goto next(...);
    goto meta(context, C_execCUDAExecutor);
}

/*
 * Clamp an iteration count to the device's threads-per-block limit.
 * Returns count unchanged when it already fits in a single block,
 * otherwise the largest block size the device supports.
 */
int computeblockDim(int count, int maxThreadPerBlock) {
    if (count < maxThreadPerBlock) {
        return count;
    }
    return maxThreadPerBlock;
}

/*
 * Exec stage: launch the task's kernel asynchronously with the parameters
 * staged by readCUDAExecutor.
 * For iterated tasks the block dimensions are clamped to the device's
 * threads-per-block limit and the grid is sized to cover the whole
 * iteration space; non-iterated tasks launch a single 1x1x1 block.
 * Continues to writeCUDAExecutor via goto meta (pipeline not yet implemented).
 */
__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    // Asynchronous launch kernel
    task->num_exec = 1;
    if (task->iterate) {
        struct MultiDimIterator* iterator = &task->iterator->iterator->MultiDimIterator;
        int blockDimX = computeblockDim(iterator->x, executor->maxThreadPerBlock);
        int blockDimY = computeblockDim(iterator->y, executor->maxThreadPerBlock);
        int blockDimZ = computeblockDim(iterator->z, executor->maxThreadPerBlock);
        // Ceil-divide each grid dimension: the original truncating division
        // (x/blockDimX) silently dropped the tail elements whenever the
        // iteration count was not a multiple of the block size.
        // NOTE(review): the extra tail threads assume the kernel bounds-checks
        // its index against the iterator extent — confirm for each kernel.
        checkCudaErrors(cuLaunchKernel(task->function,
                    (iterator->x + blockDimX - 1)/blockDimX,
                    (iterator->y + blockDimY - 1)/blockDimY,
                    (iterator->z + blockDimZ - 1)/blockDimZ,
                    blockDimX, blockDimY, blockDimZ,
                    0, NULL, (void**)executor->kernelParams, NULL));
    } else {
        checkCudaErrors(cuLaunchKernel(task->function,
                    1, 1, 1,
                    1, 1, 1,
                    0, NULL, (void**)executor->kernelParams, NULL));
    }
    // TODO: Implements pipeline
    // goto next(...);
    goto meta(context, C_writeCUDAExecutor);
}

/*
 * Write stage: wait for the kernel, copy results back to the host, and
 * release the device buffers, then continue to the task's next code segment.
 * (Original comments, translated: "issue the command that fetches the
 * results" / "wait for the command to complete".)
 */
__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    // Synchronize BEFORE reading results: the launch in execCUDAExecutor is
    // asynchronous, so the outputs are only valid once the context is idle.
    // (The original synchronized after the copies, relying on cuMemcpyDtoH's
    // implicit ordering; making the wait explicit and first is safer.)
    checkCudaErrors(cuCtxSynchronize());
    struct CUDABuffer* buffer = executor->buffer;
    int paramLen = buffer->inputLen + buffer->outputLen;
    for (int i = 0; i < paramLen; i++) {
        CUdeviceptr deviceptr =  *(executor->kernelParams[i]);
        // Parameters are laid out inputs first, then outputs; inputs are
        // copied back too, mirroring the staging order in readCUDAExecutor.
        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
        checkCudaErrors(cuMemcpyDtoH(data, deviceptr, GET_SIZE(data)));
        // cuMemFree was the only unchecked CUDA call in this file; check it
        // for consistency with every other driver-API call.
        checkCudaErrors(cuMemFree(deviceptr));
    }
    goto next(...);
}