view src/parallel_execution/cuda.c @ 319:a15511b1a6e0

separate cuda.c, and USE_CUDA_MAIN_THREAD flag
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Wed, 15 Feb 2017 20:43:55 +0900
parents
children 408b4aab7610
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <libkern/OSAtomic.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"
#include "pthread.h"

// #include "context.h"

// Per-task execution context shared between the task manager and workers.
// NOTE(review): this appears to be a hand-copied subset of the real
// "context.h" (see commented-out include above) — keep field order and
// types in sync with that header, or the structs will disagree on layout.
struct Context {
    int next;                       // index of the next code gear to run
    struct Worker* worker;          // owning worker (declared elsewhere)
    struct TaskManager* taskManager;
    int codeNum;                    // number of entries in `code`
    void  (**code) (struct Context*); // table of code-gear entry points
    void* heapStart;                // base of this context's private heap
    void* heap;                     // current allocation cursor into the heap
    long heapLimit;                 // heap capacity; units not visible here — presumably bytes
    int dataNum;                    // number of entries in `data`
    int idgCount; //number of waiting dataGear
    int odg;                        // index of first output data gear
    int maxOdg;                     // index one past the last output data gear
    int workerId;
    int num_exec;                   // kernel launch count (set to 1 in CUDAExec)
    CUmodule module;                // CUDA module loaded by CUDAExec (per launch)
    CUfunction function;            // kernel handle resolved from `module`
    union Data **data;              // data gears indexed by the fields above
};

    // State for one CUDA worker thread (one CUcontext per worker).
    // NOTE(review): the trailing `} CUDAWorker;` declares a file-scope
    // VARIABLE named CUDAWorker, not a typedef — the commented-out cast
    // `(CUDAWorker*)` in CUDAExec suggests `typedef struct ... CUDAWorker;`
    // was intended. Confirm before changing: other files may reference it.
    struct CUDAWorker {
        CUdevice device;        // device handle from cuDeviceGet (device 0)
        CUcontext cuCtx;        // driver context created in cudaInit, destroyed in cudaShutdown
        pthread_t thread;       // worker thread
        struct Context* context;
        int id;
        struct Queue* tasks;    // task queue (type declared elsewhere)
        int runFlag;
        int next;
        int num_stream;         // number of CUDA streams (set to 1; streams currently disabled)
        CUstream *stream;       // stream array; allocation is commented out in cudaInit
    } CUDAWorker;

    // Loop counter data gear, copied whole to the device in CUDAExec.
    // NOTE(review): trailing name declares a file-scope variable, same
    // likely-missing-typedef pattern as CUDAWorker above — confirm.
    struct LoopCounter {
        int i;
    } LoopCounter;

    // Array data gear processed by the "twice" kernel.
    // NOTE(review): CUDAExec passes `size` directly to cuMemAlloc/cuMemcpy,
    // so `size` must be a BYTE count, not an element count — confirm with
    // the code that fills this struct.
    struct Array {
        int size;       // buffer size (bytes, per usage in CUDAExec — verify)
        int index;      // kernel argument; semantics defined by CUDAtwice.ptx
        int prefix;     // kernel argument; semantics defined by CUDAtwice.ptx
        int* array;     // host-side payload copied to/from the device
    } Array;



// Initialize the CUDA driver API for one worker.
// The one-time global setup (cuInit, device lookup, context creation) is
// performed only when phase == 0; for any other phase the call is a no-op
// apart from resetting num_stream. Stream creation is currently disabled.
void cudaInit(struct CUDAWorker *cudaWorker, int phase) {
    cudaWorker->num_stream = 1; // number of streams
//    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
    if (phase == 0) {
        checkCudaErrors(cuInit(0));
        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
    }
//    if (cudaWorker->num_stream) {
//        for (int i=0;i<cudaWorker->num_stream;i++)
//            checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
//    }
}


// Run the "twice" kernel on `array` using the CUDA driver API.
//
// Steps: allocate device buffers, load the PTX module and resolve the
// kernel, copy inputs host-to-device, launch 1 block x 1 thread on the
// default stream, copy the result back, then release all device resources.
//
// FIX: the original leaked both device allocations (no cuMemFree) and the
// module (cuModuleLoad on every call, never cuModuleUnload); all three are
// now released after synchronization.
void CUDAExec(struct Context* context, struct Array* array, struct LoopCounter *loopCounter) {
    // Worker *worker = context->worker;
    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;

    // Device buffers for the array payload and the loop counter.
    // NOTE(review): array->size is used directly as a byte count; if it is
    // actually an element count this under-allocates — confirm with callers.
    CUdeviceptr devA;
    CUdeviceptr devLoopCounter;

    checkCudaErrors(cuMemAlloc(&devA, array->size));
    checkCudaErrors(cuMemAlloc(&devLoopCounter, sizeof(LoopCounter)));

    // Load the "twice" kernel from the PTX module (re-loaded on every call).
    checkCudaErrors(cuModuleLoad(&context->module, "c/CUDAtwice.ptx"));
    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "twice"));

    // Send the input data gears to the GPU.
    // Synchronous data transfer (host to device).
    checkCudaErrors(cuMemcpyHtoD(devLoopCounter, loopCounter, sizeof(LoopCounter)));
    checkCudaErrors(cuMemcpyHtoD(devA, array->array, array->size));

    // Asynchronous kernel launch: a single block of a single thread on the
    // default stream, so the DtoH copy below implicitly orders after it.
    context->num_exec = 1;
    void* args[] = {&devLoopCounter, &array->index, &array->prefix, &devA};
    checkCudaErrors(cuLaunchKernel(context->function,
                                   1, 1, 1,
                                   1, 1, 1,
                                   0, NULL, args, NULL));

    // Fetch the result; cuMemcpyDtoH is synchronous, so this also waits
    // for the kernel on the default stream.
    checkCudaErrors(cuMemcpyDtoH(array->array, devA, array->size));

    // Wait until every operation issued in this context has finished.
    checkCudaErrors(cuCtxSynchronize());

    // Release per-call device resources (leaked in the original).
    // context->function/module are re-created on the next call, so the
    // stale handles left behind here are never used.
    checkCudaErrors(cuMemFree(devA));
    checkCudaErrors(cuMemFree(devLoopCounter));
    checkCudaErrors(cuModuleUnload(context->module));
}

// Tear down the worker's CUDA driver context created by cudaInit().
// Stream destruction stays commented out because stream creation in
// cudaInit() is also disabled — re-enable both together.
void cudaShutdown( struct CUDAWorker *worker) {
//    for (int i=0;i<worker->num_stream;i++)
//        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
}