view src/parallel_execution/cuda.c @ 433:d920f3a3f037

Refactoring cuda.c
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Tue, 17 Oct 2017 15:47:33 +0900
parents b3359544adbb
children 08a93fc2f0d3
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <libkern/OSAtomic.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"
#include "pthread.h"

#include "context.h"

/*
struct Context {
    int next;
    struct Worker* worker;
    struct TaskManager* taskManager;
    int codeNum;
    void  (**code) (struct Context*);
    void* heapStart;
    void* heap;
    long heapLimit;
    int dataNum;
    int idgCount; //number of waiting dataGear
    int idg;
    int maxIdg;
    int odg;
    int maxOdg;
    int workerId;
    struct Context* task;
    struct Queue* tasks;
    int num_exec;
    CUmodule module;
    CUfunction function;
    union Data **data;

    // multi dimension parameter
    int iterate;
    struct Iterator* iterator;
};

struct CUDAWorker {
    CUdevice device;
    CUcontext cuCtx;
    pthread_t thread;
    struct Context* context;
    int id;
    struct Queue* tasks;
    int runFlag;
    int next;
    int num_stream;
    CUstream *stream;
} CUDAWorker;

struct LoopCounter {
    int i;
} LoopCounter;

struct Array {
    int size;
    int index;
    int prefix;
    int* array;
} Array;
*/

/*
 * Prepare the CUDA driver state held by a worker.
 *
 * cudaWorker: worker whose num_stream, device and cuCtx fields are written.
 * phase:      the driver calls (cuInit, cuDeviceGet for device ordinal 0,
 *             cuCtxCreate with CU_CTX_SCHED_SPIN) run only when phase is 0;
 *             any other value skips them.
 *
 * num_stream is set to 1 and the completion message is printed regardless of
 * phase. Stream allocation/creation is currently disabled (commented out).
 */
void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
    cudaWorker->num_stream = 1; // number of streams
    //    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );

    if (phase == 0) {
        // Driver bring-up: initialize the API, pick device 0, create its context.
        checkCudaErrors(cuInit(0));
        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
    }

    //    if (cudaWorker->num_stream) {
    //        for (int i=0;i<cudaWorker->num_stream;i++)
    //            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
    //    }
    printf("cuda Init: Done\n");
}

/*
 * Allocate `size` bytes of device memory, copy `hostData` into it
 * (synchronous host-to-device copy) and return a heap-allocated slot that
 * holds the resulting device pointer. The slot is what gets stored in the
 * kernel parameter array. Exits the process if the slot cannot be allocated.
 */
static CUdeviceptr* cudaUploadToDevice(const void* hostData, size_t size) {
    CUdeviceptr* deviceptr = (CUdeviceptr *)calloc(1, sizeof(CUdeviceptr));
    if (deviceptr == NULL) {
        fprintf(stderr, "cudaRead: failed to allocate device pointer slot\n");
        exit(EXIT_FAILURE);
    }
    // memory allocate
    checkCudaErrors(cuMemAlloc(deviceptr, size));
    // Synchronous data transfer (host to device)
    checkCudaErrors(cuMemcpyHtoD(*deviceptr, hostData, size));
    return deviceptr;
}

/*
 * Build buffer->kernelParams for a kernel launch.
 *
 * For every entry of buffer->inputData (inputLen entries) followed by every
 * entry of buffer->outputData (outputLen entries), a device buffer of
 * GET_SIZE(entry) bytes is allocated and the host contents are copied to it.
 * kernelParams[k] points at the CUdeviceptr for the k-th buffer, inputs first.
 *
 * Both the kernelParams array and the per-buffer slots are heap allocated
 * here; cudaWrite() in this file is the function that releases them.
 * A host allocation failure is fatal (message on stderr, exit).
 */
void cudaRead(struct CudaBuffer* buffer) {
    int paramTotal = buffer->inputLen + buffer->outputLen;
    buffer->kernelParams = (void **)calloc(paramTotal, sizeof(void *));
    // calloc(0, ...) may legitimately return NULL, so only a non-empty
    // request that comes back NULL is treated as an error.
    if (buffer->kernelParams == NULL && paramTotal > 0) {
        fprintf(stderr, "cudaRead: failed to allocate kernel parameter array\n");
        exit(EXIT_FAILURE);
    }

    int paramCount = 0;
    for (int i = 0; i < buffer->inputLen; i++) {
        buffer->kernelParams[paramCount++] =
            cudaUploadToDevice(buffer->inputData[i], GET_SIZE(buffer->inputData[i]));
    }

    // Output buffers are uploaded as well, exactly like the inputs.
    for (int i = 0; i < buffer->outputLen; i++) {
        buffer->kernelParams[paramCount++] =
            cudaUploadToDevice(buffer->outputData[i], GET_SIZE(buffer->outputData[i]));
    }
}

/*
 * Load a CUDA module and look up one kernel in it.
 *
 * context:  receives the module handle in context->module and the kernel
 *           handle in context->function.
 * filename: passed to cuModuleLoad as the module file to load.
 * function: name passed to cuModuleGetFunction to look up within the module.
 *
 * Both driver calls are wrapped in checkCudaErrors.
 */
void cudaLoadFunction(struct Context* context, char* filename, char* function) {
    checkCudaErrors(cuModuleLoad(&context->module, filename));
    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, function));
}

/*
 * Launch context->function with buffer->kernelParams on the default stream
 * (stream argument NULL, no dynamic shared memory). The launch is
 * asynchronous; this function does not wait for the kernel to finish.
 *
 * context: supplies the kernel handle and, when context->iterate is non-zero,
 *          the MultiDimIterator whose x/y/z extents size the launch.
 *          context->num_exec is set to 1.
 * buffer:  supplies the kernel parameter array built by cudaRead().
 *
 * Iterated launch geometry:
 *   - x >= 1024: grid x = x / 1024, block x = 1024 (unchanged behavior).
 *   - x <  1024: grid x = 1, block x = x. Previously x / 1024 evaluated to 0
 *     here, which is not a valid grid dimension, so small iteration spaces
 *     could not be launched at all.
 *   - y and z are used directly as grid dimensions; block y and z are 1.
 * Non-iterated launches use a single thread (1x1x1 grid, 1x1x1 block).
 *
 * NOTE(review): when x is larger than 1024 but not a multiple of it, the
 * division still truncates and the trailing x % 1024 elements get no thread.
 * Rounding up would launch extra threads, which is only safe if the kernels
 * bounds-check their index -- confirm before changing.
 */
void cudaExec2(struct Context* context, struct CudaBuffer* buffer) {
    // Threads per block along x used for iterated launches.
    enum { BLOCK_DIM_X_MAX = 1024 };

    // Asynchronous launch kernel
    context->num_exec = 1;
    if (context->iterate) {
        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
        unsigned int gridDimX;
        unsigned int blockDimX;
        if (iterator->x < BLOCK_DIM_X_MAX) {
            // Small iteration space: one block with exactly x threads.
            // (x <= 0 still yields an invalid block size, which the driver
            // rejects and checkCudaErrors reports, as before.)
            gridDimX = 1;
            blockDimX = iterator->x;
        } else {
            gridDimX = iterator->x / BLOCK_DIM_X_MAX;
            blockDimX = BLOCK_DIM_X_MAX;
        }
        checkCudaErrors(cuLaunchKernel(context->function,
                    gridDimX, iterator->y, iterator->z,
                    blockDimX, 1, 1,
                    0, NULL, buffer->kernelParams, NULL));

    } else {
        checkCudaErrors(cuLaunchKernel(context->function,
                    1, 1, 1,
                    1, 1, 1,
                    0, NULL, buffer->kernelParams, NULL));
    }
}

/*
 * Copy `size` bytes from the device buffer referenced by `deviceptr` back to
 * `hostData` (synchronous device-to-host copy), then release the device
 * buffer and the heap slot that held its pointer.
 */
static void cudaDownloadAndRelease(void* hostData, CUdeviceptr* deviceptr, size_t size) {
    checkCudaErrors(cuMemcpyDtoH(hostData, *deviceptr, size));
    // A failed free is reported instead of being silently dropped.
    checkCudaErrors(cuMemFree(*deviceptr));
    free(deviceptr);
}

/*
 * Fetch the results of a launch and tear down the parameter array.
 *
 * Waits for the context to finish its outstanding work, then, for every
 * input buffer followed by every output buffer, copies GET_SIZE(entry) bytes
 * from the matching device buffer back into the host entry and frees the
 * device buffer and its pointer slot. kernelParams is walked in the same
 * order cudaRead() filled it (inputs first, then outputs). Finally the
 * kernelParams array itself is freed and the field is reset to NULL so a
 * stale pointer cannot be reused or freed twice.
 */
void cudaWrite(struct CudaBuffer* buffer) {
    // Wait for the launched work first, so that an error raised by the
    // kernel is reported by the synchronize call rather than by the first
    // copy that happens to follow it.
    checkCudaErrors(cuCtxSynchronize());

    int paramCount = 0;
    for (int i = 0; i < buffer->inputLen; i++) {
        CUdeviceptr* deviceptr = buffer->kernelParams[paramCount++];
        cudaDownloadAndRelease(buffer->inputData[i], deviceptr, GET_SIZE(buffer->inputData[i]));
    }

    for (int i = 0; i < buffer->outputLen; i++) {
        CUdeviceptr* deviceptr = buffer->kernelParams[paramCount++];
        cudaDownloadAndRelease(buffer->outputData[i], deviceptr, GET_SIZE(buffer->outputData[i]));
    }

    free(buffer->kernelParams);
    buffer->kernelParams = NULL;
}

/*
 * Run one kernel end to end: load it, upload the buffers, launch, and read
 * the results back.
 *
 * context:  receives the module/function handles and drives the launch.
 * buffer:   host input/output data; kernelParams is created by cudaRead()
 *           and released by cudaWrite().
 * filename: module file handed to cudaLoadFunction().
 * function: kernel name handed to cudaLoadFunction().
 */
void cudaExec(struct Context* context, struct CudaBuffer* buffer, char* filename, char* function) {
    // Load the kernel if it has not been defined yet.
    // NOTE(review): the call below is unconditional, so the module is loaded
    // on every invocation, and nothing in this function unloads it.
    cudaLoadFunction(context, filename, function);
    cudaRead(buffer);
    cudaExec2(context, buffer);
    cudaWrite(buffer);
}

/*
 * Tear down the CUDA state of a worker by destroying worker->cuCtx.
 * Stream destruction is currently disabled (commented out), matching the
 * disabled stream creation in cudaInit().
 */
void cudaShutdown( struct CUDAWorker *worker) {
    //    for (int i=0;i<worker->num_stream;i++)
    //        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
}