view TaskManager/Gpu/GpuScheduler.cc @ 1548:614a3f62c881 draft

add set work item size function
author Yuhi TOMARI <yuhi@cr.ie.u-ryukyu.ac.jp>
date Fri, 15 Feb 2013 07:37:04 +0900
parents 2983e9e93d24
children 68200bc3ab6b
line wrap: on
line source

#include "TaskManager.h"
#include "GpuScheduler.h"
#include "ReferencedDmaManager.h"
#include "PreRefDmaManager.h"
#include "SchedTask.h"
#include "stdio.h"
#include "GpuError.h"
#include "ListData.h"
#include <fcntl.h>
#include <sys/stat.h>
#include <string.h>

// Default construction: flag 0 selects the default DMA manager
// (ReferencedDmaManager, no copy — see init_impl), then the OpenCL
// platform/device/context/queues are set up.
GpuScheduler::GpuScheduler()
{
    init_impl(0);
    init_gpu();
}

void
GpuScheduler::init_impl(int useRefDma)
{
    if (useRefDma & 0x10) {
        fifoDmaManager = new PreRefDmaManager(); // Prefetch command and no copy
    } else if (useRefDma & 0x01) {
        fifoDmaManager = new FifoDmaManager(); // memcpy
    } else {
        fifoDmaManager = new ReferencedDmaManager(); // no copy
    }
    connector = fifoDmaManager;
}


/*
 * Prepare OpenCL:
 * get OpenCL information
 * create command queue
 */
void
GpuScheduler::init_gpu()
{
    clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
    // unavailable GPU
    if(ret_num_devices == 0) {
        exit(EXIT_FAILURE);
    }
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    command_queue = new cl_command_queue[2];
    command_queue[0] = clCreateCommandQueue(context, device_id, 0, &ret);
    command_queue[1] = clCreateCommandQueue(context, device_id, 0, &ret);
}

// Release the OpenCL resources created in init_gpu().
GpuScheduler::~GpuScheduler()
{
    clReleaseCommandQueue(command_queue[0]);
    clReleaseCommandQueue(command_queue[1]);
    delete[] command_queue; // fix: the queue array itself was leaked
    clReleaseContext(context);
}


/*
 * run GPU task
 * Get input and output data from tasklist.
 * Enqueue OpenCL command and clflush.
 * Enqueue and clflush are pipelined structure.
 */

void
GpuScheduler::run()
{
    int cur = 0;
    memaddr reply[2];
    cl_kernel *kernel = new cl_kernel[2];
    cl_mem *memin[2];
    cl_mem *memout[2];
    HTask::htask_flag flag;
    memset(reply, 0, sizeof(memaddr)*2);
    memset(&flag, 0, sizeof(HTask::htask_flag));

    for (;;) {
        memaddr params_addr = connector->task_list_mail_read();
        // read task list mail from DmaManager

        if ((memaddr)params_addr == (memaddr)MY_SPE_COMMAND_EXIT) {
            clFinish(command_queue[0]);
            clFinish(command_queue[1]);
            return ;
        }

        while (params_addr) {
            // since we are on the same memory space, we don't has to use dma_load here
            TaskListPtr tasklist = (TaskListPtr)connector->dma_load(this, params_addr,
                                                        sizeof(TaskList), DMA_READ_TASKLIST);

            if (tasklist->self) {
                /*
                 * get flip flag
                 * flip : When caluculate on input data, to treat this as a output data
                 */
                flag = tasklist->self->flag;
            }

            for (TaskPtr nextTask = tasklist->tasks;
                 nextTask < tasklist->last(); nextTask = nextTask->next()) {

                load_kernel(nextTask->command);
                cl_program& program = *task_list[nextTask->command].gputask->program;
                const char *function = task_list[nextTask->command].name;

                kernel[cur] = clCreateKernel(program, function, &ret);
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }
                int param = 0;
                
                size_t gws[3],lws[3];
                memset(gws, 0, sizeof(size_t)*3);
                memset(lws, 0, sizeof(size_t)*3);
                cl_uint dimension;
                if (flag.nd_range) {
                    ListElement *input_buf = nextTask->inData(0);
                    size_t *ws_buf = (size_t*)input_buf->addr;
                    dimension = (cl_uint)ws_buf[0];
                    /* dimension check
                     * if () {
                     * error("Invalid work item dimension\n");
                     * }
                     */
                    for (int i=0; i<dimension; i++) {
                        gws[i] = ws_buf[i+1];
                        lws[i] = ws_buf[i+1+dimension];
                    }
                }

                // set arg count
                cl_mem memparam = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                                 sizeof(memaddr)*nextTask->param_count, NULL, NULL);
                ret = clEnqueueWriteBuffer(command_queue[cur], memparam, CL_TRUE, 0,
                                           sizeof(memaddr)*nextTask->param_count,nextTask->param(0), 0, NULL, NULL);
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                ret = clSetKernelArg(kernel[cur], param, sizeof(memaddr),(void *)&memparam);
                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }

                param++;

                cl_mem_flags mem_flag = CL_MEM_READ_ONLY;
                memin[cur] = new cl_mem[nextTask->inData_count];
                if (!flag.flip) { // set input data when not flip
                    int i=flag.nd_range? 1:0;

                    for(;i<nextTask->inData_count;i++) {
                        ListElement *input_buf = nextTask->inData(i);
                        memin[cur][i] = clCreateBuffer(context, mem_flag, input_buf->size, NULL, NULL);
                        ret = clEnqueueWriteBuffer(command_queue[cur], memin[cur][i], CL_TRUE, 0,
                                                   input_buf->size, input_buf->addr, 0, NULL, NULL);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }
                        ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memin[cur][i]);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }

                        param++;
                    }
                }
                cl_mem_flags out_mem_flag;
                if (flag.flip) {
                    memout[cur] = new cl_mem[nextTask->inData_count];
                    out_mem_flag = CL_MEM_READ_WRITE;
                } else {
                    memout[cur] = new cl_mem[nextTask->outData_count];
                    out_mem_flag = CL_MEM_WRITE_ONLY;
                }

                int i = (flag.nd_range)&&(flag.flip)? 1:0;
                for(;i<nextTask->outData_count;i++) { // set output data
                    ListElement *output_buf = flag.flip? nextTask->inData(i) : nextTask->outData(i);
                    memout[cur][i] = clCreateBuffer(context, out_mem_flag, output_buf->size, NULL, &ret);
                    if (ret<0) {
                        const char *msg=convert_error_status(ret);
                        error(msg);
                    }

                    if (flag.flip) { // use output buffer as input buffer
                        ListElement *input_buf = nextTask->inData(i);

                        ret = clEnqueueWriteBuffer(command_queue[cur], memout[cur][i], CL_TRUE, 0,
                                                   input_buf->size, input_buf->addr, 0, NULL, NULL);
                        if (ret<0) {
                            const char *msg=convert_error_status(ret);
                            error(msg);
                        }
                    }
                    ret = clSetKernelArg(kernel[cur],  param, sizeof(memaddr), (void *)&memout[cur][i]);
                    if (ret<0) {
                        const char *msg=convert_error_status(ret);
                        error(msg);
                    }
                    param++;
                }

                cl_event ev = NULL;
                if (flag.nd_range){
                    ret = clEnqueueNDRangeKernel(command_queue[cur],kernel[cur],dimension,NULL,gws,lws,0,NULL,NULL);
                } else {
                    ret = clEnqueueTask(command_queue[cur], kernel[cur], 0, NULL, &ev);    
                }

                if (ret<0) {
                    const char *msg=convert_error_status(ret);
                    error(msg);
                }
                // ndrange flagが0ならdim,global_work_size[0],local_work_size[0] = 1で固定に
                // clEnqueueNDRange
                // (command_queue[cur], kernel[cur], dim, NULL,global_work_size[0],local_work_size[0],NULL&ev);

                for(int i=0;i<nextTask->outData_count;i++) { // read output data
                    ListElement *output_buf = flag.flip? nextTask->inData(i) :nextTask->outData(i);
                    ret = clEnqueueReadBuffer(command_queue[cur], memout[cur][i], CL_TRUE, 0,
                                              output_buf->size, output_buf->addr, 1, &ev, NULL);
                    if (ret<0) {
                        const char *msg=convert_error_status(ret);
                        error(msg);
                    }
                }
            }

            reply[cur] = (memaddr)tasklist->waiter;
            clFlush(command_queue[1-cur]); // waiting for queued task
            // pipeline    : 1-cur
            // no pipeline : cur
            clReleaseKernel(kernel[1-cur]);
            /* should be released
             *  clReleaseMemObject(memin[1-cur]);
             *  clReleaseMemObject(memout[1-cur]);
             */
            if(reply[1-cur]) {
                connector->mail_write(reply[1-cur]);
            }

            params_addr = (memaddr)tasklist->next;
            cur = 1 - cur;
        }

          clFlush(command_queue[1-cur]); // waiting for queued task
          connector->mail_write(reply[1-cur]);


          connector->mail_write((memaddr)MY_SPE_STATUS_READY);
    }
    // TaskArrayの処理
}

/*
 * Fallback `run` handler installed by gpu_register_task(): reports
 * that the kernel for this command has not been built yet.  Replaced
 * by null_run once load_kernel() has compiled the program.
 */
int
not_ready(SchedTask* smanager, void* r, void *w)
{
    int command = smanager->atask->command;
    smanager->printf("GPU task not ready %d\n", command);
    return 0;
}

/*
 * kernel file open and build program
 *
 * task_list[cmd].gputask->program initially holds the kernel *file name*
 * (stashed there by gpu_register_task); after a successful build the slot
 * is overwritten with a pointer to the built cl_program, and run is
 * switched to null_run so the build happens only once per command.
 */
void
GpuScheduler::load_kernel(int cmd)
{
    if (task_list[cmd].run == null_run) return; // already built

    const char *filename = (const char *)task_list[cmd].gputask->program;

    int fd;
    char *source_str;
    size_t source_size;

    fd = open(filename, O_RDONLY);

    if (fd<0) {
        fprintf(stderr, "Failed to load kernel %s.\n",filename);
        exit(1);
    }

    struct stat stats;
    fstat(fd,&stats);
    off_t size = stats.st_size;

    if (size<=0) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }

    // fix: use a heap buffer instead of alloca() — kernel sources can be
    // arbitrarily large and an unbounded alloca risks stack overflow
    source_str = new char[size];
    source_size = read(fd, source_str, size);
    close(fd);

    if (source_size != (size_t)size) { // fix: short/failed read was ignored
        fprintf(stderr, "Failed to load kernel %s.\n",filename);
        delete[] source_str;
        exit(1);
    }

    cl_program *program = new cl_program;
    *program = clCreateProgramWithSource(context, 1,
                                  (const char **)&source_str,
                                  (const size_t *)&source_size, &ret);
    // OpenCL copies the source strings into the program object,
    // so the buffer can be freed right away
    delete[] source_str;
    ret = clBuildProgram(*program, 1, &device_id, NULL, NULL, NULL);

    if(ret<0) {
        size_t log_size;
        clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

        char *log = new char[log_size];
        clGetProgramBuildInfo(*program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        error(log);
        delete[] log; // fix: build log was leaked
    }
    task_list[cmd].gputask->program = program;
    task_list[cmd].run = null_run; // kernel is ready

}

/*
 * Register a GPU task: record the kernel function name for `cmd` and
 * stash the kernel *file name* in the program slot.  The program is
 * built lazily by load_kernel() on first dispatch; until then the run
 * handler is not_ready.
 */
void
gpu_register_task(int cmd, const char* filename, const char* functionname)
{
    task_list[cmd].name = functionname;
    // load_kernel() replaces this file-name pointer with the built cl_program
    task_list[cmd].gputask->program = (cl_program *) filename;
    task_list[cmd].load = null_loader;
    task_list[cmd].wait = null_loader;
    task_list[cmd].run  = not_ready;  // becomes null_run once the kernel is built
}

/* end */