view src/parallel_execution/CUDAWorker.cbc @ 303:1dbaef86593b

CUDAtwice.cbc
author ikkun
date Mon, 13 Feb 2017 18:23:29 +0900
parents 8e7926f3e271
children 9755206813cb
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>

#include <cuda.h>

#include <cuda_runtime.h>
#include "helper_cuda.h"

#include <libkern/OSAtomic.h>

#include "../context.h"

// Thread entry routine handed to pthread_create by createCUDAWorker; defined below.
static void start_CUDAworker(Worker* worker);

/*
 * Allocate a Worker together with its CUDAWorker implementation, wire up the
 * worker's code-gear entry points, and start the worker thread.
 *
 * context: context passed to ALLOC for both allocations
 * id:      identifier stored in the CUDAWorker
 * queue:   task queue stored in worker->tasks
 *
 * Returns the new Worker cast to union Data*.
 */
union Data* createCUDAWorker(struct Context* context, int id, Queue* queue) {
    struct Worker* worker = ALLOC(context, Worker);
    // The local is named cudaWorker (not CUDAWorker) so it does not hide the
    // type name, which is also passed as a token to ALLOC.
    struct CUDAWorker* cudaWorker = ALLOC(context, CUDAWorker);
    worker->worker = (union Data*)cudaWorker;
    worker->tasks = queue;
    // Was `cpuWorker->id`, an identifier that is never declared in this function.
    cudaWorker->id = id;
    worker->taskReceive = C_taskReceiveCUDAWorker;
    worker->shutdown = C_shutdownCUDAWorker;
    // The thread runs start_CUDAworker with the Worker as its argument.
    pthread_create(&cudaWorker->thread, NULL, (void*)&start_CUDAworker, worker);
    return (union Data*)(worker);
}

/*
 * Thread entry routine for a CUDA worker (started by pthread_create in
 * createCUDAWorker).
 *
 * Creates and initializes a Context for the worker, registers the Worker in
 * that context, initializes the CUDA driver API on device 0, loads the
 * "multiply" function from multiply.ptx, creates the streams, and finally
 * jumps to the taskReceiveCUDAWorker code gear.
 *
 * NOTE(review): device/context/module/function/stream are locals of this
 * function. shutdownCUDAWorker refers to `stream`, `module` and `num_stream`,
 * which it cannot see; the handles presumably need to be stored in the
 * CUDAWorker structure -- confirm against the struct definition.
 */
static void start_CUDAworker(Worker* worker) {
    // Local is named cudaWorker so the CUDAWorker type name stays usable in the cast.
    CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;
    cudaWorker->context = NEW(struct Context);
    initContext(cudaWorker->context);
    Gearef(cudaWorker->context, Worker)->worker = (union Data*)worker;
    int num_stream = 1; // number of streams

    // initialize and load kernel
    CUdevice device;
    CUcontext context;
    CUmodule module;
    CUfunction function;
    CUstream stream[num_stream];

    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&device, 0));
    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
    // The loop body simply does not run when num_stream is 0, so no extra guard is needed.
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamCreate(&stream[i], 0));

    goto meta(cudaWorker->context, C_taskReceiveCUDAWorker);
}

/*
 * Code gear that requests the next task from the worker's task queue.
 *
 * context: the worker's context
 * worker:  worker whose `tasks` queue is read
 * queue:   queue argument area; receives the queue to operate on and the
 *          code gear to continue with afterwards
 *
 * Continues to the queue's `take` code gear.
 */
__code taskReceiveCUDAWorker(struct Context* context, Worker* worker, Queue* queue) {
    queue->queue = (union Data*)worker->tasks;
    // Continue with this file's task-running gear (getCUDATask); the previous
    // C_getTask does not name any code gear defined in this file.
    queue->next = C_getCUDATask;
    goto meta(context, worker->tasks->take);
}

__code taskReceiveCUDAWorker_stub(struct Context* context) {
    CUDAWorker* CUDAWorker = (CUDAWorker *)GearImpl(context, CUDAWorker, CUDAworker);
    pthread_cond_wait(&CUDAWorker->cond, &CUDAWorker->mutex);
    goto taskReceiveCUDAWorker(context, &Gearef(context, Worker)->worker->Worker, Gearef(context, Queue));
}

/*
 * Code gear that starts executing a task obtained from the queue.
 *
 * context: the worker's context; its `next` is set so that control comes
 *          back to taskReceiveCUDAWorker after the task
 * worker:  worker recorded in the task
 * task:    context of the task to run; when it is NULL the gear returns
 *          without continuing
 */
__code getCUDATask(struct Context* context, Worker* worker, struct Context* task) {
    if (task == NULL) {
        // end thread
        return;
    }
    // set CG after task exec
    context->next = C_taskReceiveCUDAWorker;
    task->worker = worker;
    goto meta(task, task->next);
}

/*
 * Stub for getCUDATask: pulls the Worker and the task Context (the queue's
 * `data`) out of the context and continues to getCUDATask.
 */
__code getCUDATask_stub(struct Context* context) {
    goto getCUDATask(context,
                     &Gearef(context,Worker)->worker->Worker,
                     &Gearef(context, Queue)->data->Context);
}

// Compiled only when USE_CUDA is defined.
#ifdef USE_CUDA
/*
 * Code gear that issues a host-to-device copy, a kernel launch and a
 * device-to-host copy.
 *
 * NOTE(review): this looks like an unfinished placeholder -- confirm before use:
 *  - cuMemcpyHtoDAsync is given `context` for its first three arguments;
 *  - `cuLaunchkanel` is presumably a misspelling of cuLaunchKernel, and it is
 *    called without arguments;
 *  - cuMemcpyDtoHAsync is called without arguments;
 *  - none of the calls is wrapped in checkCudaErrors, unlike the rest of the file;
 *  - the gear ends without a goto to a next code gear.
 */
__code twiceCUDA(struct Context* context) {
    cuMemcpyHtoDAsync(context,context,context,context->stream);
    cuLaunchkanel();
    cuMemcpyDtoHAsync();
}
#endif

/*
 * Code gear that releases the worker's CUDA resources: destroys each stream,
 * unloads the module, and destroys the CUDA context.
 *
 * NOTE(review): as written this block refers to names it does not declare --
 * confirm where they are meant to come from:
 *  - `num_stream`, `stream` and `module` are locals of start_CUDAworker and
 *    are not visible here;
 *  - `context` in this scope is the struct Context* parameter, yet it is
 *    passed to cuCtxDestroy; presumably the CUcontext created in
 *    start_CUDAworker was intended;
 *  - the `worker` parameter is unused and is typed CPUWorker*.
 * The gear also ends without a goto to a next code gear.
 */
__code shutdownCUDAWorker(struct Context* context, CPUWorker* worker) {
    // Destroy every stream.
    for (int i=0;i<num_stream;i++)
        checkCudaErrors(cuStreamDestroy(stream[i]));
    // Unload the module, then tear down the CUDA context.
    checkCudaErrors(cuModuleUnload(module));
    checkCudaErrors(cuCtxDestroy(context));
}

__code shutdownCUDAWorker_stub(struct Context* context) {
    CPUWorker* worker = (CPUWorker *)GearImpl(context, Worker, worker);
    goto shutdownCUDAWorker(context,worker);
}