view TaskManager/Cuda/CudaScheduler.cc @ 1951:da22fc4db5b2 draft

fix
author Shohei KOKUBO <e105744@ie.u-ryukyu.ac.jp>
date Thu, 06 Feb 2014 18:14:49 +0900
parents f19885ea776d
children 273de551f726
line wrap: on
line source

#include "TaskManager.h"
#include "CudaScheduler.h"
#include "ReferencedDmaManager.h"
#include "PreRefDmaManager.h"
#include "SchedTask.h"
#include "CudaError.h"
#include "ListData.h"
#include "SysFunc.h"
#include "gettime.h"
#include "error.h"
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <string.h>
#include <cuda.h>

// Task table indexed by task command id: cuda_register_task() fills in an
// entry, CudaScheduler reads it when loading and launching a kernel.
TaskObject cuda_task_list[MAX_TASK_OBJECT];

/*
 * Construct a scheduler that does not own a CUDA context yet; init_gpu()
 * creates it later. The handle is cleared here so the destructor never
 * passes an indeterminate value to cuCtxDestroy() when init_gpu() was not
 * run.
 */
CudaScheduler::CudaScheduler() {
    context = 0;
}

/*
 * Initialize the CUDA driver API and create a context on device 0.
 *
 * Every driver call is checked and a failing status is reported through
 * error(). The process exits with EXIT_FAILURE when no device is reported.
 */
void
CudaScheduler::init_gpu() {
    ret = cuInit(0);
    if (ret!=0) {
        error(convert_error_status(ret));
    }

    // start from a known value so that a failed query reads as "no device"
    ret_num_devices = 0;
    ret = cuDeviceGetCount(&ret_num_devices);
    if (ret!=0) {
        error(convert_error_status(ret));
    }
    if (ret_num_devices == 0) {
        fprintf(stderr, "CudaScheduler: no CUDA device found.\n");
        exit(EXIT_FAILURE);
    }

    ret = cuDeviceGet(&device, 0);
    if (ret!=0) {
        error(convert_error_status(ret));
    }

    ret = cuCtxCreate(&context, 0, device);
    if (ret!=0) {
        error(convert_error_status(ret));
    }
}

/*
 * Destroy the CUDA context held in `context`.
 * A null handle is skipped, and the handle is cleared after destruction so
 * it cannot be destroyed a second time.
 */
CudaScheduler::~CudaScheduler()
{
    if (context != 0) {
        cuCtxDestroy(context);
        context = 0;
    }
}

/*
 * Set up one pipeline-stage buffer: allocate the host-side tables that hold
 * the device input handles, device output handles and output events, and
 * create the stream of the stage.
 *
 * The tables are zero-filled because release_buf_event() decides whether a
 * slot has to be freed by comparing it with 0; slots that were never written
 * (for example memout in flip mode) must therefore read as empty.
 * Exits with EXIT_FAILURE when the tables cannot be allocated; a stream
 * creation failure is reported through error().
 */
void
CudaScheduler::initCudaBuffer(CudaBufferPtr m) {
    m->allcate_size = 64;
    m->in_size = 0;
    m->out_size = 0;
    // element size is sizeof(CUdeviceptr)/sizeof(CUevent), not the size of a pointer to them
    m->memin = (CUdeviceptr*)calloc(m->allcate_size, sizeof(CUdeviceptr));
    m->memout = (CUdeviceptr*)calloc(m->allcate_size, sizeof(CUdeviceptr));
    m->event = (CUevent*)calloc(m->allcate_size, sizeof(CUevent));
    if (m->memin == NULL || m->memout == NULL || m->event == NULL) {
        fprintf(stderr, "CudaScheduler: cannot allocate CUDA buffer table.\n");
        exit(EXIT_FAILURE);
    }
    ret = cuStreamCreate(&(m->stream), 0);
    if (ret!=0)
        error(convert_error_status(ret));
}

/*
 * Tear down one pipeline-stage buffer: free the three host-side tables,
 * destroy the stream (a failure is reported through error()) and reset
 * every field of the buffer to zero.
 */
void
CudaScheduler::destroyCudaBuffer(CudaBufferPtr m) {
    // host-side tables first
    free(m->memin);
    free(m->memout);
    free(m->event);

    ret = cuStreamDestroy(m->stream);
    if (ret != 0) {
        error(convert_error_status(ret));
    }

    // leave no dangling pointers or stale counters behind
    m->stream = 0;
    m->event = 0;
    m->memout = 0;
    m->memin = 0;
    m->allcate_size = 0;
    m->out_size = 0;
    m->in_size = 0;
}

/*
 * Allocate `size` bytes of device memory and store the handle in slot `i`
 * of `mem`; the callers in this file pass cudabuffer->memin or
 * cudabuffer->memout as `mem`.
 *
 * When slot `i` does not fit, the three tables of the stage (memin, memout,
 * event) are grown together and the new slots are zero-filled. The result is
 * left in the member `ret` for the caller to test: the status of
 * cuMemAlloc(), CUDA_ERROR_OUT_OF_MEMORY when the tables could not be grown,
 * or CUDA_ERROR_INVALID_VALUE for a negative slot index.
 */
void
CudaScheduler::createBuffer(CudaBufferPtr cudabuffer, CUdeviceptr* mem, int i, size_t size) {
    if (i < 0) {
        ret = CUDA_ERROR_INVALID_VALUE;
        return;
    }

    // slot i is valid only while i < allcate_size, hence >= and not >
    if (i >= cudabuffer->allcate_size) {
        // remember which table the caller meant: realloc() may move it
        const bool is_memin = (mem == cudabuffer->memin);
        const bool is_memout = (mem == cudabuffer->memout);

        const size_t old_count = (size_t)cudabuffer->allcate_size;
        size_t new_count = old_count > 0 ? old_count : 1;
        while ((size_t)i >= new_count) {
            new_count *= 2;
        }

        // keep the old table whenever realloc() fails, so nothing is lost
        CUdeviceptr* new_memin = (CUdeviceptr*)realloc(cudabuffer->memin, new_count*sizeof(CUdeviceptr));
        if (new_memin != NULL) cudabuffer->memin = new_memin;
        CUdeviceptr* new_memout = (CUdeviceptr*)realloc(cudabuffer->memout, new_count*sizeof(CUdeviceptr));
        if (new_memout != NULL) cudabuffer->memout = new_memout;
        CUevent* new_event = (CUevent*)realloc(cudabuffer->event, new_count*sizeof(CUevent));
        if (new_event != NULL) cudabuffer->event = new_event;

        if (new_memin == NULL || new_memout == NULL || new_event == NULL) {
            // allcate_size is left unchanged: every table still has at least that many slots
            ret = CUDA_ERROR_OUT_OF_MEMORY;
            return;
        }

        // new slots must read as empty for release_buf_event()
        memset(cudabuffer->memin + old_count, 0, (new_count - old_count)*sizeof(CUdeviceptr));
        memset(cudabuffer->memout + old_count, 0, (new_count - old_count)*sizeof(CUdeviceptr));
        memset(cudabuffer->event + old_count, 0, (new_count - old_count)*sizeof(CUevent));
        cudabuffer->allcate_size = new_count;

        // re-point the caller's table at its (possibly moved) storage
        if (is_memin) {
            mem = cudabuffer->memin;
        } else if (is_memout) {
            mem = cudabuffer->memout;
        }
    }

    ret = cuMemAlloc(&mem[i], size);
}

// Value CudaTaskError() stores in a kernel_event[] slot that has no event to
// wait for; wait_for_event() does nothing for such a slot.
#define NOP_REPLY NULL

static void
release_buf_event(int cur, CudaScheduler::CudaBufferPtr mem) {
    for (int i=0; i<mem[cur-1].in_size; i++) {
        if (mem[cur-1].memin[i])
            cuMemFree(mem[cur-1].memin[i]);
        mem[cur-1].memin[i] = 0;
    }
    for (int i=0; i<mem[cur-1].out_size; i++) {
        if (mem[cur-1].event[i] != 0)
            cuEventDestroy(mem[cur-1].event[i]);
        mem[cur-1].event[i] = 0;
        if (mem[cur-1].memout[i])
            cuMemFree(mem[cur-1].memout[i]);
        mem[cur-1].memout[i] = 0;
    }
    mem[cur-1].in_size = 0;
    mem[cur-1].out_size = 0;
}

/*
 * Wait until pipeline stage `cur-1` (the stage number is passed 1-based) has
 * finished, release its resources and send the pending reply, if any.
 *
 * When the stage has a kernel event, the event is synchronized and
 * destroyed, every output event of the stage is synchronized, and
 * release_buf_event() frees the buffers and events of the stage. A slot
 * equal to NOP_REPLY has nothing to wait for and is skipped.
 *
 * Independently of that, a non-zero `reply` is written to the connector and
 * cleared. `taskList` is only used for the debug line and may be NULL.
 * Driver failures are reported through error().
 */
void
CudaScheduler::wait_for_event(CUevent* kernel_event, CudaBufferPtr cudabuffer, TaskListPtr taskList, int cur) {
    if (kernel_event[cur-1] != NOP_REPLY) {
        ret = cuEventSynchronize(kernel_event[cur-1]);
        if (ret!=0) {
            error(convert_error_status(ret));
        }
        // is there no way to take a timestamp here?
        ret = cuEventDestroy(kernel_event[cur-1]);
        if (ret!=0) {
            error(convert_error_status(ret));
        }
        kernel_event[cur-1] = 0;

        // the output copies were queued after the kernel; wait for each of them
        for (int i = 0; i<cudabuffer[cur-1].out_size; i++) {
            ret = cuEventSynchronize(cudabuffer[cur-1].event[i]);
            if (ret!=0) error(convert_error_status(ret));
        }
        release_buf_event(cur, cudabuffer);
    }

    if(reply) {
        connector->mail_write(reply);
        // the debug line reads the task list, so skip it when there is none
        if (taskList != NULL) {
            __debug(this, "CUDA %d %s\t%lld\n", taskList->cpu_type, (char*)(cuda_task_list[taskList->tasks[0].command].name), taskList->task_end_time-taskList->task_start_time);
        }
        reply = 0;
    }
}

/*
 * Error path for a task that failed in stage `cur` (0-based).
 *
 * Reports `ret` through error(), destroys the kernel event of the stage if
 * there is one, marks the stage as having nothing to wait for, clears its
 * kernel handle and releases its buffers and events. Afterwards the
 * preceding stage is waited for, which also delivers a pending reply.
 */
void
CudaScheduler::CudaTaskError(CudaBufferPtr cudabuffer, int cur, TaskListPtr taskList, int ret) {
    error(convert_error_status(ret));
    if (kernel_event[cur] != 0)
        cuEventDestroy(kernel_event[cur]);
    kernel_event[cur] = NOP_REPLY;
    kernel[cur] = 0;
    release_buf_event(cur+1, cudabuffer);

    // wait_for_event() works on slot cur-1; for stage 0 the preceding stage
    // is the last one, so pass STAGE instead of indexing slot -1
    wait_for_event(kernel_event, cudabuffer, taskList, cur == 0 ? STAGE : cur);
}

/*
 * Main loop of the CUDA scheduler.
 *
 * Creates the CUDA context and one buffer set per pipeline stage, then keeps
 * reading task-list addresses from the connector until MY_SPE_COMMAND_EXIT
 * arrives. For every task the kernel is looked up, the parameter block and
 * the input data are copied to the device, the kernel is launched on the
 * stream of the current stage and the output copies are queued. After a
 * task has been queued the preceding stage is waited for and the stage index
 * advances (wrapping at STAGE).
 *
 * A failing step is reported through CudaTaskError() and the current task is
 * abandoned; the stage index is not advanced in that case.
 */
void
CudaScheduler::run() {
    init_gpu();
    int cur = 0;
    TaskListPtr tasklist = NULL;
    reply = 0;
    
    for (int i = 0; i<STAGE; i++) {
        initCudaBuffer(&cudabuffer[i]);
        kernel_event[i]=0;
    }

    memset(&flag, 0, sizeof(HTask::htask_flag)*STAGE);

    for (;;) {
        memaddr param_addr = connector->task_list_mail_read();

        if ((memaddr)param_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
            for (int i = 0; i<STAGE; i++) {
                destroyCudaBuffer(&cudabuffer[i]);
            }
            return;
        }

        (*connector->start_dmawait_profile)(&(connector->start_time));
        while (param_addr) {
            // since we are on the same memory space, we don't have to use dma_load here
            tasklist = (TaskListPtr)connector->dma_load(this, param_addr,
                                                        sizeof(TaskList), DMA_READ_TASKLIST);
            //            tasklist[cur]->task_start_time = gettime();
            tasklist->task_start_time = 0;

            for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
                /*
                 * get flip flag
                 * flip : when calculating on the input data, treat it as the output data too
                 *
                 * cur advances with every task, so the flag of the stage is
                 * refreshed for every task and not only once per task list.
                 */
                if (tasklist->self) {
                    flag[cur] = tasklist->self->flag;
                } else {
                    memset(&flag[cur], 0, sizeof(HTask::htask_flag));
                }

                if(nextTask->command==ShowTime) {
                    connector->show_profile(); continue;
                }
                if(nextTask->command==StartProfile) {
                    connector->start_profile(); continue;
                }
                if (load_kernel(nextTask->command) == 0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                CUmodule& module = *cuda_task_list[nextTask->command].cudatask->module;
                const char *funcname = cuda_task_list[nextTask->command].name;
                
                ret = cuModuleGetFunction(&kernel[cur], module, funcname);
                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                
                int param = 0;
                // set when a step inside one of the nested loops fails, so the
                // whole task (not just one loop iteration) can be abandoned
                bool failed = false;

                // set arg count
                createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, sizeof(memaddr)*nextTask->param_count);
                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                // count the buffer right away so the error path releases it
                cudabuffer[cur].in_size = param + 1;

                // parameter is passed as first kernel arg 
                ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], nextTask->param(0), sizeof(memaddr)*nextTask->param_count, cudabuffer[cur].stream);
                if (ret!=0) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                
                param++;
                
                for(int i=0;i<nextTask->inData_count;i++) {
                    ListElement *input_buf = nextTask->inData(i);
                    if (input_buf->size==0) break;
                    createBuffer(&cudabuffer[cur], cudabuffer[cur].memin, param, input_buf->size);
                    if (ret!=0) { failed = true; break; }
                    cudabuffer[cur].in_size = param + 1;
                    ret = cuMemcpyHtoDAsync(cudabuffer[cur].memin[param], input_buf->addr, input_buf->size, cudabuffer[cur].stream);
                    if (ret!=0) { failed = true; break; }
                    
                    param++;
                }
                if (failed) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                cudabuffer[cur].in_size = param; // parameter block + input buffers
                
                for(int i = 0; i<nextTask->outData_count;i++) { // set output data
                    ListElement *output_buf = nextTask->outData(i);
                    if (output_buf->size==0) break;
                    if (!flag[cur].flip) { // flip use memin for output 
                        createBuffer(&cudabuffer[cur], cudabuffer[cur].memout, i, output_buf->size);
                        if (ret!=0) { failed = true; break; }
                        // enqueue later
                    }
                    param++;
                }
                // NOTE(review): output buffers allocated before a failure are not
                // released here because out_size is still 0 at this point — confirm
                if (failed) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }
                cudabuffer[cur].out_size = param - cudabuffer[cur].in_size; // no buffer on flip, but flip use memout event
                
                void** kernelParams;
                
                if (!flag[cur].flip) {
                    kernelParams = (void**)malloc(sizeof(void*)*param);
                } else {
                    kernelParams = (void**)malloc(sizeof(void*)*cudabuffer[cur].in_size);
                }
                if (kernelParams == NULL) { CudaTaskError(cudabuffer, cur, tasklist, CUDA_ERROR_OUT_OF_MEMORY); continue; }

                for (int i = 0; i<cudabuffer[cur].in_size; i++) {
                    kernelParams[i] = &cudabuffer[cur].memin[i];
                }
                if (!flag[cur].flip) {
                    for (int i = 0; i<cudabuffer[cur].out_size; i++) {
                        kernelParams[i+cudabuffer[cur].in_size] = &cudabuffer[cur].memout[i];
                    }
                }

                ret = cuEventCreate(&kernel_event[cur], 0);
                if (ret!=0) { free(kernelParams); CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
                    
                if (tasklist->dim > 0) {
                    ret = cuLaunchKernel(kernel[cur],
                                         tasklist->x*tasklist->y*tasklist->z, 1, 1,
                                         1, 1, 1,
                                         0, cudabuffer[cur].stream, kernelParams, NULL);
                } else {
                    ret = cuLaunchKernel(kernel[cur],
                                         1, 1, 1,
                                         1, 1, 1,
                                         0, cudabuffer[cur].stream, kernelParams, NULL);
                }
                // the argument array is not used after the launch call, so it is
                // released here once for the success path and every error path
                free(kernelParams);
                if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }

                ret = cuEventRecord(kernel_event[cur], cudabuffer[cur].stream);
                if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
                
                for(int i=0;i<nextTask->outData_count;i++) { // read output data
                    ListElement *output_buf = nextTask->outData(i);
                    if (output_buf->size==0) break;
                    CUdeviceptr* mem = flag[cur].flip ? cudabuffer[cur].memin : cudabuffer[cur].memout ;
                    int i0 = flag[cur].flip ? i+1 : i ;
                    // flip use memin buffer and memout event
                    ret = cuMemcpyDtoHAsync(output_buf->addr, mem[i0], output_buf->size, cudabuffer[cur].stream);
                    if (ret!=0) { failed = true; break; }

                    ret = cuEventCreate(&cudabuffer[cur].event[i], 0);
                    if (ret!=0) { failed = true; break; }

                    ret = cuEventRecord(cudabuffer[cur].event[i], cudabuffer[cur].stream);
                    if (ret!=0) { failed = true; break; }
                }
                if (failed) { CudaTaskError(cudabuffer, cur, tasklist, ret); continue; }

                // wait kernel[1-cur] and write[1-cur]
                // pipeline    : cur
                // to stop pipeline set cur+1
                if (cur == 0) {
                    wait_for_event(kernel_event, cudabuffer, tasklist, STAGE); // to stop pipeline comment out this line
                } else {
                    wait_for_event(kernel_event, cudabuffer, tasklist, cur);
                }
                cur++;
                if (STAGE <= cur)
                    cur = 0;
            }
            reply = (memaddr)tasklist->waiter;
            param_addr = (memaddr)tasklist->next;
        }
        if (cur == 0) {
            wait_for_event(kernel_event, cudabuffer, tasklist, STAGE);
        } else {
            wait_for_event(kernel_event, cudabuffer, tasklist, cur);
        }
        for (int i = 0; i<STAGE; i++) {
            ret = cuStreamSynchronize(cudabuffer[i].stream);
            if (ret!=0) { CudaTaskError(cudabuffer , cur, tasklist, ret); continue; }
        }
        
        unsigned long long wait = 0;
        (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
    }
    /* NOT REACHED */
}

/*
 * Run function that cuda_register_task() installs for a task whose kernel
 * has not been loaded yet: prints the command id of the task through the
 * scheduler's printf and returns 0. The read/write buffers are not used.
 */
int
not_ready(SchedTask* smanager, void* r, void *w)
{
    (void)r;    // unused
    (void)w;    // unused

    smanager->printf("GPU task not ready %d\n", smanager->atask->command);

    return 0;
}

/*
 * kernel file open and build program
 */
int
CudaScheduler::load_kernel(int cmd)
{
    if (cuda_task_list[cmd].run == null_run) {
        return 1;
    }

    if (cuda_task_list[cmd].cudatask == 0 || cuda_task_list[cmd].cudatask->filename == 0) {
        fprintf(stderr, "CUDA module %d not defined.\n",cmd);
        return 0;
    }

    CUmodule* module = new CUmodule;
    ret = cuModuleLoad(module, cuda_task_list[cmd].cudatask->filename);

    if(ret!=0) {
        error(convert_error_status(ret));
    }
    cuda_task_list[cmd].cudatask->module = module;
    cuda_task_list[cmd].run = null_run; // kernel is ready
    return 1;
}

// regist kernel file name
void
cuda_register_task(int cmd, const char* filename, const char* functionname)
{
    cuda_task_list[cmd].run = not_ready;  // not yet ready
    cuda_task_list[cmd].load = null_loader;
    cuda_task_list[cmd].wait = null_loader;
    cuda_task_list[cmd].name = functionname;
    cuda_task_list[cmd].cudatask->filename = (const char*)filename;
}

/* end */