view TaskManager/Gpu/GpuScheduler.cc @ 1716:c12df61ded45 draft

fix gpu profile. not work yet
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Tue, 15 Oct 2013 17:21:10 +0900
parents 9392f4d97cff
children d911bef11c8a
line wrap: on
line source

#include "TaskManager.h"
#include "GpuScheduler.h"
#include "ReferencedDmaManager.h"
#include "PreRefDmaManager.h"
#include "SchedTask.h"
#include "stdio.h"
#include "GpuError.h"
#include "ListData.h"
#include "SysFunc.h"
#include "gettime.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <string.h>

TaskObject gpu_task_list[MAX_TASK_OBJECT];

GpuScheduler::GpuScheduler()
{
    init_impl(0);
    init_gpu();
}

void
GpuScheduler::init_impl(int useRefDma)
{
    if (useRefDma & 0x10) {
        fifoDmaManager = new PreRefDmaManager(); // Prefetch command and no copy
    } else if (useRefDma & 0x01) {
        fifoDmaManager = new FifoDmaManager(); // memcpy
    } else {
        fifoDmaManager = new ReferencedDmaManager(); // no copy
    }
    connector = fifoDmaManager;
}


/*
 * Prepare OpenCL:
 * get OpenCL information
 * create command queue
 */
void
GpuScheduler::init_gpu()
{
    clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
    // unavailable GPU
    if(ret_num_devices == 0) {
        exit(EXIT_FAILURE);
    }
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    command_queue = clCreateCommandQueue(context, device_id, 0,&ret);
    if (ret<0) {
        const char *msg=convert_error_status(ret);
        error(msg);
    }
}

GpuScheduler::~GpuScheduler()
{
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
}



void
GpuScheduler::wait_for_event(cl_event* event,memaddr* reply,int cur) {
    if (event[1-cur] != NULL) {
        int ret=clWaitForEvents(1,&event[1-cur]);
        clReleaseEvent(event[1-cur]);
        if (ret<0) {
            const char *msg=convert_error_status(ret);
            error(msg);
        }
        if(reply[1-cur]) {
            connector->mail_write(reply[1-cur]);
            reply[1-cur]=0;
        }
        event[1-cur]=NULL;
    }
}


/*
 * run GPU task
 * Get input and output data from tasklist.
 * Enqueue OpenCL command and clflush.
 * Enqueue and clflush are pipelined structure.
 */
void
GpuScheduler::run()
{
    int cur = 0;
    memaddr reply[2]={0,0};
    cl_kernel kernel[2]={0,0};
    cl_event event[2];
    event[0]=NULL;event[1]=NULL;

    cl_mem *memin[2];
    cl_mem *memout[2];
    TaskListPtr tasklist[2];
    HTask::htask_flag flag;
    memset(&flag, 0, sizeof(HTask::htask_flag));

    for (;;) {
        memaddr params_addr = connector->task_list_mail_read();
        // read task list mail from DmaManager

        if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
            clFinish(command_queue);
            if (kernel[0])
                clReleaseKernel(kernel[0]);
            if (kernel[1])
                clReleaseKernel(kernel[1]);
            if (event[0])
                clReleaseEvent(event[0]);
            if (event[1])
                clReleaseEvent(event[1]);
            return ;
        }

        (*connector->start_dmawait_profile)();
        while (params_addr) {
            // since we are on the same memory space, we don't has to use dma_load here
            tasklist[cur] = (TaskListPtr)connector->dma_load(this, params_addr,
                                                                    sizeof(TaskList), DMA_READ_TASKLIST);
            tasklist[cur]->start_time = gettime();
            /*
             * get flip flag
             * flip : When caluculate on input data, to treat this as a output data
             */
            if (tasklist[cur]->self) {
                flag = tasklist[cur]->self->flag;
            }
            
            for (TaskPtr nextTask = tasklist[cur]->tasks;nextTask < tasklist[cur]->last(); nextTask = nextTask->next()) {
                if(nextTask->command==ShowTime) {
                    connector->show_profile();
                    continue;
                }
                if(nextTask->command==StartProfile) {
                    connector->start_profile();
                    continue;
                }
                load_kernel(nextTask->command);
                cl_program& program = *gpu_task_list[nextTask->command].gputask->program;
                const char *function = gpu_task_list[nextTask->command].name;

                if (kernel[cur])
                    clReleaseKernel(kernel[cur]);
                kernel[cur] = clCreateKernel(program, function, &ret);
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                int param = 0;

                // set arg count
                cl_mem memparam = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                                 sizeof(memaddr)*nextTask->param_count, NULL, &ret);
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                ret = clEnqueueWriteBuffer(command_queue, memparam, CL_TRUE, 0,sizeof(memaddr)*nextTask->param_count,
                                           nextTask->param(param), 0, NULL, NULL);
                param=0;
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr),(void *)&memparam);
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                param++;

                cl_mem_flags mem_flag = CL_MEM_READ_ONLY;
                memin[cur] = new cl_mem[nextTask->inData_count];

                if (!flag.flip) { // set input data when not flip
                    for(int i=0;i<nextTask->inData_count;i++) {
                        ListElement *input_buf = nextTask->inData(i);
                        if (input_buf->size==0) break;
                        memin[cur][i] = clCreateBuffer(context, mem_flag, input_buf->size, NULL, &ret);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }
                        ret = clEnqueueWriteBuffer(command_queue, memin[cur][i], CL_TRUE, 0,
                                                   input_buf->size, input_buf->addr, 0, NULL, NULL);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }
                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur][i]);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }

                        param++;
                    }
                }
                cl_mem_flags out_mem_flag;
                if (flag.flip) {
                    memout[cur] = new cl_mem[nextTask->inData_count];
                    out_mem_flag = CL_MEM_READ_WRITE;
                } else {
                    memout[cur] = new cl_mem[nextTask->outData_count];
                    out_mem_flag = CL_MEM_WRITE_ONLY;
                }

                
                for(int i = 0; i<nextTask->outData_count;i++) { // set output data
                    ListElement *output_buf = flag.flip? nextTask->inData(i) : nextTask->outData(i);
                    if (output_buf->size==0) break;
                    memout[cur][i] = clCreateBuffer(context, out_mem_flag, output_buf->size, NULL, &ret);
                    if (ret<0) {
                        const char *msg=convert_error_status(ret);
                        error(msg);
                    }

                    if (flag.flip) { // use output buffer as input buffer
                        ListElement *input_buf = nextTask->inData(i);

                        ret = clEnqueueWriteBuffer(command_queue, memout[cur][i], CL_TRUE, 0,
                                                   input_buf->size, input_buf->addr, 0, NULL, NULL);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }
                    }
                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur][i]);
                    if (ret<0) {
                        const char *msg=convert_error_status(ret);
                        error(msg);
                    }
                    param++;
                }
                
                if (tasklist[cur]->dim > 0) {
                    ret = clEnqueueNDRangeKernel(command_queue, kernel[cur], tasklist[cur]->dim,
                                                 NULL, &tasklist[cur]->x, 0, 0, NULL, NULL);
                } else {
                    ret = clEnqueueTask(command_queue, kernel[cur], 0, NULL, NULL);
                }
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                for(int i=0;i<nextTask->outData_count;i++) { // read output data
                    ListElement *output_buf = flag.flip? nextTask->inData(i) :nextTask->outData(i);
                    if (output_buf->size==0) break;
                    ret = clEnqueueReadBuffer(command_queue, memout[cur][i], CL_FALSE, 0,
                                              output_buf->size, output_buf->addr, 0, NULL, &event[cur]);
                    if (ret<0) {
                        const char *msg=convert_error_status(ret);
                        error(msg);
                    }
                }
                    
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                reply[cur] = (memaddr)tasklist[cur]->waiter;

                wait_for_event(event,reply,cur);
                tasklist[1-cur]->stop_time = gettime();
                //clFlush(command_queue); // waiting for queued task

                // clFlush(command_queue);
                // pipeline    : 1-cur
                // no pipeline : cur
                /* should be released
                 *  clReleaseMemObject(memin[1-cur]);
                 *  clReleaseMemObject(memout[1-cur]);
                 */
                
                params_addr = (memaddr)tasklist[cur]->next;
                cur = 1 - cur;
            }
        }
        wait_for_event(event,reply,cur);
        tasklist[1-cur]->stop_time = gettime();
        //clFlush(command_queue); // waiting for queued task
        //clFinish(command_queue); // waiting for queued task
        (*connector->end_dmawait_profile)(&connector->global_busy_time);
        connector->mail_write((memaddr)MY_SPE_STATUS_READY);
    }
    // TaskArrayの処理
}

int
not_ready(SchedTask* smanager, void* r, void *w)
{
    smanager->printf("GPU task not ready %d\n", smanager->atask->command);
    return 0;
}

/*
 * kernel file open and build program
 */
void
GpuScheduler::load_kernel(int cmd)
{
    if (gpu_task_list[cmd].run == null_run) return;

    const char *filename = (const char *)gpu_task_list[cmd].gputask->program;

    int fd;
    char *source_str;
    size_t source_size;

    fd = open(filename, O_RDONLY);

    if (fd<0) {
        fprintf(stderr, "Failed to load kernel %s.\n",filename);
        exit(1);
    }

    struct stat stats;
    fstat(fd,&stats);
    off_t size = stats.st_size;

    if (size<=0) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    source_str = (char*)alloca(size);
    source_size = read(fd, source_str, size);
    close(fd);

    cl_program *program = new cl_program;
    *program = clCreateProgramWithSource(context, 1,
                                         (const char **)&source_str,
                                         (const size_t *)&source_size, &ret);
    ret = clBuildProgram(*program, 1, &device_id, NULL, NULL, NULL);

    if(ret<0) {
        size_t size;
        clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);

        char *log = new char[size];
        clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
        error(log);
    }
    gpu_task_list[cmd].gputask->program = program;
    gpu_task_list[cmd].run = null_run; // kernel is ready

}

// regist kernel file name
void
gpu_register_task(int cmd, const char* filename, const char* functionname)
{
    gpu_task_list[cmd].run = not_ready;  // not yet ready
    gpu_task_list[cmd].load = null_loader;
    gpu_task_list[cmd].wait = null_loader;
    gpu_task_list[cmd].name = functionname;
    gpu_task_list[cmd].gputask->program = (cl_program *) filename;
}

/* end */