view src/parallel_execution/CUDAExecutor.cbc @ 437:2c1b1d56bf1e

Work CUDAbitonicSort by CUDAExecutor
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Sat, 04 Nov 2017 08:30:25 +0900
parents 08a93fc2f0d3
children 7679093bdd72
line wrap: on
line source

#include "../context.h"
#include <stdio.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include "../helper_cuda.h"
#include "pthread.h"

// Builds an Executor interface backed by a newly allocated CUDAExecutor.
//
// The returned Executor has its read / exec / write slots set to the code
// gear identifiers of the three CUDA stages (C_readCUDAExecutor,
// C_execCUDAExecutor, C_writeCUDAExecutor), and its `executor` field points
// at the CUDAExecutor instance.
//
// `context` is not referenced by name in this body; presumably the `new`
// expressions are expanded to allocate from it -- TODO confirm against the
// CbC stub generator.
Executor* createCUDAExecutor(struct Context* context) {
    struct Executor* iface = new Executor();
    struct CUDAExecutor* impl = new CUDAExecutor();
    // Wire up the three stage entry points.
    iface->write = C_writeCUDAExecutor;
    iface->exec  = C_execCUDAExecutor;
    iface->read  = C_readCUDAExecutor;
    // Attach the CUDA-specific state behind the generic interface.
    iface->executor = (union Data*)impl;
    return iface;
}

// Read stage: allocates one device buffer per kernel parameter and uploads
// the matching host data into it.
//
// Parameters are ordered as all entries of buffer->inputData followed by all
// entries of buffer->outputData. For each one a CUdeviceptr is allocated,
// sized with GET_SIZE(data), filled from the host copy, and stored in
// executor->kernelParams at the same index.
//
// `next` is currently unused: control always continues to
// C_execCUDAExecutor (see the pipeline TODO below).
__code readCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    struct CUDABuffer* buf = executor->buffer;
    int inputCount = buf->inputLen;
    int total = inputCount + buf->outputLen;
    executor->kernelParams = (CUdeviceptr**)ALLOCATE_PTR_ARRAY(context, CUdeviceptr, total);
    int idx = 0;
    while (idx < total) {
        CUdeviceptr* slot = new CUdeviceptr();
        // Pick the host-side datum: inputs come first, then outputs.
        union Data* host;
        if (idx < inputCount) {
            host = buf->inputData[idx];
        } else {
            host = buf->outputData[idx - inputCount];
        }
        // Allocate device memory for this parameter.
        checkCudaErrors(cuMemAlloc(slot, GET_SIZE(host)));
        // Synchronous data transfer (host to device).
        checkCudaErrors(cuMemcpyHtoD(*slot, host, GET_SIZE(host)));
        executor->kernelParams[idx] = slot;
        idx++;
    }
    // TODO: Implements pipeline
    // goto next(...);
    goto meta(context, C_execCUDAExecutor);
}

// Exec stage: launches task->function with the device pointers prepared in
// executor->kernelParams.
//
// The block size is always 1x1x1. The grid size is taken from the task's
// MultiDimIterator (x, y, z) when task->iterate is set, and is 1x1x1
// otherwise. No dynamic shared memory is requested and the launch goes to
// the NULL stream.
//
// `next` is currently unused: control always continues to
// C_writeCUDAExecutor (see the pipeline TODO below).
__code execCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    task->num_exec = 1;

    // Default to a single-block grid; widen it for iterated tasks.
    unsigned int gridX = 1;
    unsigned int gridY = 1;
    unsigned int gridZ = 1;
    if (task->iterate) {
        struct MultiDimIterator* dims = &task->iterator->iterator->MultiDimIterator;
        gridX = dims->x;
        gridY = dims->y;
        gridZ = dims->z;
    }

    // Asynchronous launch kernel
    checkCudaErrors(cuLaunchKernel(task->function,
                gridX, gridY, gridZ,
                1, 1, 1,
                0, NULL, (void**)executor->kernelParams, NULL));

    // TODO: Implements pipeline
    // goto next(...);
    goto meta(context, C_writeCUDAExecutor);
}

// Write stage: copies every kernel parameter buffer back to the host,
// releases the device memory, then waits for the CUDA context to go idle
// before continuing to `next`.
//
// Parameters are visited in the same order the read stage used: all entries
// of buffer->inputData followed by all entries of buffer->outputData. Both
// inputs and outputs are copied back, each with size GET_SIZE(data).
//
// Every driver call is routed through checkCudaErrors, including cuMemFree,
// so a failed release is reported instead of being silently dropped.
__code writeCUDAExecutor(struct CUDAExecutor* executor, struct Context* task, __code next(...)) {
    // Issue the commands that fetch the results back from the device,
    // then wait for those commands to finish.
    struct CUDABuffer* buffer = executor->buffer;
    int paramLen = buffer->inputLen + buffer->outputLen;
    for (int i = 0; i < paramLen; i++) {
        CUdeviceptr deviceptr =  *(executor->kernelParams[i]);
        // Inputs come first, then outputs.
        union Data* data = i < buffer->inputLen? buffer->inputData[i] : buffer->outputData[i-buffer->inputLen];
        checkCudaErrors(cuMemcpyDtoH(data, deviceptr, GET_SIZE(data)));
        // Release the device buffer; check the result like every other driver call.
        checkCudaErrors(cuMemFree(deviceptr));
    }
    // wait for stream
    checkCudaErrors(cuCtxSynchronize());
    goto next(...);
}