view TaskManager/Gpu/GpuScheduler.cc @ 1886:c3573a5ac6a1 draft

GPU also waits
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Fri, 03 Jan 2014 16:57:34 +0900
parents 44fa0f1320a9
children 5238ca826d6e
line wrap: on
line source

#include "TaskManager.h"
#include "GpuScheduler.h"
#include "ReferencedDmaManager.h"
#include "PreRefDmaManager.h"
#include "SchedTask.h"
#include "stdio.h"
#include "GpuError.h"
#include "ListData.h"
#include "SysFunc.h"
#include "gettime.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <string.h>

TaskObject gpu_task_list[MAX_TASK_OBJECT];

GpuScheduler::GpuScheduler()
{
    init_gpu();
}

void
GpuScheduler::init_impl(int useRefDma)
{
    if (useRefDma & 0x10) {
        fifoDmaManager = new PreRefDmaManager(); // Prefetch command and no copy
    } else if (useRefDma & 0x01) {
        fifoDmaManager = new FifoDmaManager(); // memcpy
    } else {
        fifoDmaManager = new ReferencedDmaManager(); // no copy
    }
    connector = fifoDmaManager;
}


/*
 * Prepare OpenCL:
 * get OpenCL information
 * create command queue
 */
void
GpuScheduler::init_gpu()
{
    clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
    //    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
    // unavailable GPU
    if(ret_num_devices == 0) {
        exit(EXIT_FAILURE);
    }
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret);
    if (ret<0) {
        const char *msg=convert_error_status(ret);
        error(msg);
    }

}

GpuScheduler::~GpuScheduler()
{
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
}

void
GpuScheduler::initGpuBuffer(GpuBufferPtr m) {
    m->size = 0;
    m->allocate_size = 64;
    m->buf  = (cl_mem*)malloc(m->allocate_size*sizeof(cl_mem*));
    m->event  = (cl_event*)malloc(m->allocate_size*sizeof(cl_event*));
}

void
GpuScheduler::destroyGpuBuffer(GpuBufferPtr m) {
    free(m->buf);
    free(m->event);
    m->size = 0;
    m->allocate_size = 0;
    m->buf = 0;
    m->event = 0;
}

cl_mem
GpuScheduler::createBuffer(GpuBufferPtr m, int i,  cl_context context, cl_mem_flags flags, size_t size, cl_int *error) {
    if (i > m->allocate_size) {
        // reallocate buffer size 
        m->allocate_size *= 2;
        m->buf = (cl_mem*)realloc(m->buf, m->allocate_size*sizeof(cl_mem*));
        m->event = (cl_event*)realloc(m->event, m->allocate_size*sizeof(cl_event*));
    }

    m->buf[i] = clCreateBuffer(context, flags, size, 0, error);
    return m->buf[i];
}

#define NOP_REPLY NULL

static void
release_buf_event(int cur, GpuScheduler::GpuBufferPtr memout) {
    for (int i=0; i < memout[1-cur].size; i++) {
        if (memout[1-cur].event[i] != 0)
            clReleaseEvent(memout[1-cur].event[i]);
        memout[1-cur].event[i] = 0;
        if (memout[1-cur].buf[i] != 0)
            clReleaseMemObject(memout[1-cur].buf[i]);
        memout[1-cur].buf[i]   = 0;
    }
    memout[1-cur].size = 0;
}

/**
 * wait for previous pipeline termination
 * kernel_event, memout_event
 */
void
GpuScheduler::wait_for_event(cl_event* kernel_event, GpuBufferPtr memout, TaskListPtr taskList, int cur) {
    if (kernel_event[1-cur] == NOP_REPLY) {
        
    } else if (kernel_event[1-cur] != NULL) {
        int ret=clWaitForEvents(1,&kernel_event[1-cur]);

        if (ret<0) {
            error(convert_error_status(ret));
        }
        if (taskList!=NULL){
            cl_ulong start = 0;
            cl_ulong end   = 0;
            clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
            clGetEventProfilingInfo(kernel_event[1-cur],CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
            if (taskList->task_start_time == 0)
                taskList->task_start_time = start;
            taskList->task_end_time   = end;
        }
        clReleaseEvent(kernel_event[1-cur]);
        kernel_event[1-cur] = 0;

    }
    
    if (memout[1-cur].size > 0) {
        int ret=clWaitForEvents(memout[1-cur].size, memout[1-cur].event);
        if (ret<0) error(convert_error_status(ret));
        release_buf_event(cur,memout);
    }

    if (memin[1-cur].size > 0) {
        release_buf_event(cur,memin);
    }
    if(reply) {
        connector->mail_write(reply);
        printf("GPU %d %s\t%lld\n",taskList->self->cpu_type,(char*)(gpu_task_list[taskList->tasks[0].command].name),taskList->task_end_time-taskList->task_start_time);
        reply = 0;
    }
}

void
GpuScheduler::gpuTaskError(int cur, TaskListPtr tasklist, int ret)
{
    error(convert_error_status(ret));
    if (kernel_event[cur] != 0)
        clReleaseEvent(kernel_event[cur]);
    kernel_event[cur] = NOP_REPLY;
    if (kernel[cur] != 0)
        clReleaseKernel(kernel[cur]);
    kernel[cur] = 0;
    release_buf_event(1-cur,memout);
    release_buf_event(1-cur,memin);

    // wait kernel[1-cur] and write[1-cur]
    wait_for_event(kernel_event, memout, tasklist, cur);
}

/*
 * run GPU task
 * Get input and output data from tasklist.
 * Enqueue OpenCL command and clflush.
 * Enqueue and clflush are pipelined structure.
 *
 * flip means that kernel modify input buffer only, copy GPU input buffer to outData.
 *
 */
void
GpuScheduler::run()
{
    int cur = 0;
    TaskListPtr tasklist = NULL;
    reply = 0;
    initGpuBuffer(&memin[0]);initGpuBuffer(&memin[1]);
    initGpuBuffer(&memout[0]);initGpuBuffer(&memout[1]);
    memset(&flag, 0, sizeof(HTask::htask_flag)*2);
    
    for (;;) {
        memaddr params_addr = connector->task_list_mail_read();
        // read task list mail from DmaManager

        if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
            // wait_for_envet was called, so all kernel,buf,event have been released.
            clFinish(command_queue);
            destroyGpuBuffer(&memout[1-cur]);
            destroyGpuBuffer(&memout[cur]);
            destroyGpuBuffer(&memin[cur]);
            destroyGpuBuffer(&memin[1-cur]);
            return ;
        }

        (*connector->start_dmawait_profile)(&(connector->start_time));
        while (params_addr) {
            // since we are on the same memory space, we don't has to use dma_load here
            tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
                                                                    sizeof(TaskList), DMA_READ_TASKLIST);
            //            tasklist[cur]->task_start_time = gettime();
            tasklist->task_start_time = 0;
            /*
             * get flip flag
             * flip : When caluculate on input data, to treat this as a output data
             */
            if (tasklist->self) {
                flag[cur] = tasklist->self->flag;
            } else {
                 memset(&flag[cur], 0, sizeof(HTask::htask_flag));
            }
            for (TaskPtr nextTask = tasklist->tasks; nextTask < tasklist->last(); nextTask = nextTask->next()) {
                if(nextTask->command==ShowTime) {
                    connector->show_profile(); continue;
                }
                if(nextTask->command==StartProfile) {
                    connector->start_profile(); continue;
                }
                if (load_kernel(nextTask->command) == 0) { gpuTaskError(cur,tasklist,ret); continue; }
                cl_program& program = *gpu_task_list[nextTask->command].gputask->program;
                const char *function = gpu_task_list[nextTask->command].name;
                
                if (kernel[cur])
                    clReleaseKernel(kernel[cur]);
                kernel[cur] = clCreateKernel(program, function, &ret);
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }

                int param = 0;

                // set arg count
                cl_mem memparam = createBuffer(&memin[cur], 0, context, CL_MEM_READ_ONLY,
                                               sizeof(memaddr)*nextTask->param_count, &ret);
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }

                // parameter is passed as first kernel arg 
                ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
                                           nextTask->param(0), 0, NULL, &memin[cur].event[0]);
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                
                ret = clSetKernelArg(kernel[cur], 0, sizeof(memaddr),(void *)&memin[cur].buf[0]);
                if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }

                param++;

                cl_mem_flags mem_flag = flag[cur].flip ? CL_MEM_READ_WRITE : CL_MEM_READ_ONLY;
                
                for(int i=0;i<nextTask->inData_count;i++) {
                    ListElement *input_buf = nextTask->inData(i);
                    if (input_buf->size==0) break;
                    createBuffer(&memin[cur], param, context, mem_flag, input_buf->size, &ret);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                    ret = clEnqueueWriteBuffer(command_queue, memin[cur].buf[param], CL_TRUE, 0,
                                               input_buf->size, input_buf->addr, 0, 
                                               NULL, &memin[cur].event[param]);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur].buf[param]);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                    
                    param++;
                }
                memin[cur].size  = param; // +1 means param
                
                for(int i = 0; i<nextTask->outData_count;i++) { // set output data
                    ListElement *output_buf = nextTask->outData(i);
                    if (output_buf->size==0) break;
                    if (!flag[cur].flip) { // flip use memin for output 
                        createBuffer(&memout[cur], i, context, CL_MEM_WRITE_ONLY, output_buf->size, &ret);
                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur].buf[i]);
                        if (ret<0) { gpuTaskError(cur,tasklist,ret); continue;}
                        // enqueue later
                    }
                    param++;
                }
                memout[cur].size = param - memin[cur].size;  // no buffer on flip, but flip use memout event

                if (tasklist->dim > 0) {
                    ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist->dim,
                                 NULL, &tasklist->x, 0, memin[cur].size, memin[cur].event, &kernel_event[cur]);
                } else {
                    ret = clEnqueueTask(command_queue, kernel[cur], memin[cur].size,
                                        memin[cur].event, &kernel_event[cur]);
                }
                if (ret<0) { gpuTaskError(cur, tasklist, ret); continue; }

                for(int i=0;i<nextTask->outData_count;i++) { // read output data
                    ListElement *output_buf = nextTask->outData(i);
                    if (output_buf->size==0) break;
                    GpuBufferPtr mem = flag[cur].flip ? memin : memout ;
                    int i0 = flag[cur].flip ? i+1 : i ;
                    // flip use memin buffer and memout event
                    ret = clEnqueueReadBuffer(command_queue, mem[cur].buf[i0], CL_FALSE, 0,
                                              output_buf->size, output_buf->addr, 1, &kernel_event[cur], &memout[cur].event[i]);
                    if (ret<0) { gpuTaskError(cur,tasklist,ret); continue; }
                }
                // wait kernel[1-cur] and write[1-cur]
                // pipeline    : cur
                // to stop pipeline set 1-cur
                wait_for_event(kernel_event, memout, tasklist, cur);
                cur = 1 - cur;
            }
            reply = (memaddr)tasklist->waiter;
            params_addr = (memaddr)tasklist->next;
        }
        wait_for_event(kernel_event, memout, tasklist, cur);

        unsigned long long wait = 0;
        (*connector->end_dmawait_profile)(&wait, &(connector->start_time), &(connector->stop_time));
        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
    }
    /* NOT REACHED */
}

int
not_ready(SchedTask* smanager, void* r, void *w)
{
    smanager->printf("GPU task not ready %d\n", smanager->atask->command);
    return 0;
}

/*
 * kernel file open and build program
 */
int
GpuScheduler::load_kernel(int cmd)
{
    if (gpu_task_list[cmd].run == null_run) {
        return 1;
    }

    if (gpu_task_list[cmd].gputask == 0 || gpu_task_list[cmd].gputask->program == 0) {
        fprintf(stderr, "GPU kernel %d not defined.\n",cmd);
        return 0;
    }
    const char *filename = (const char *)gpu_task_list[cmd].gputask->program;

    int fd;
    char *source_str;
    size_t source_size;

    fd = open(filename, O_RDONLY);

    if (fd<0) {
        fprintf(stderr, "Failed to open kernel %s.\n",filename);
        exit(1);
    }

    struct stat stats;
    fstat(fd,&stats);
    off_t size = stats.st_size;

    if (size<=0) {
        fprintf(stderr, "Failed to read kernel.\n");
        exit(1);
    }

    source_str = (char*)alloca(size+1);
    source_size = read(fd, source_str, size);
    close(fd);
    source_str[size] = 0;

    cl_program *program = new cl_program;
    *program = clCreateProgramWithSource(context, 1,
                                         (const char **)&source_str, 0, &ret);
    ret = clBuildProgram(*program, 1, &device_id, NULL, NULL, NULL);

    if(ret<0) {
        size_t size;
        clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);

        char *log = new char[size];
        clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
        error(log);
    }
    gpu_task_list[cmd].gputask->program = program;
    gpu_task_list[cmd].run = null_run; // kernel is ready
    return 1;
}

// regist kernel file name
void
gpu_register_task(int cmd, const char* filename, const char* functionname)
{
    gpu_task_list[cmd].run = not_ready;  // not yet ready
    gpu_task_list[cmd].load = null_loader;
    gpu_task_list[cmd].wait = null_loader;
    gpu_task_list[cmd].name = functionname;
    gpu_task_list[cmd].gputask->program = (cl_program *) filename;
}

/* end */