view src/parallel_execution/cuda.c @ 414:49159fbdd1fb

Work CUDAbitonicSort
author Tatsuki IHA <innparusu@cr.ie.u-ryukyu.ac.jp>
date Fri, 15 Sep 2017 22:49:45 +0900
parents 409e6b5fb775
children 35b37fe8d3a7
line wrap: on
line source

#include <stdio.h>
#include <sys/time.h>
#include <string.h>
#include <stdlib.h>
#include <libkern/OSAtomic.h>

// includes, project
#include <driver_types.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "helper_cuda.h"
#include "pthread.h"

#include "context.h"

/*
struct Context {
    int next;
    struct Worker* worker;
    struct TaskManager* taskManager;
    int codeNum;
    void  (**code) (struct Context*);
    void* heapStart;
    void* heap;
    long heapLimit;
    int dataNum;
    int idgCount; //number of waiting dataGear
    int idg;
    int maxIdg;
    int odg;
    int maxOdg;
    int workerId;
    struct Context* task;
    struct Queue* tasks;
    int num_exec;
    CUmodule module;
    CUfunction function;
    union Data **data;

    // multi dimension parameter
    int iterate;
    struct Iterator* iterator;
};

struct CUDAWorker {
    CUdevice device;
    CUcontext cuCtx;
    pthread_t thread;
    struct Context* context;
    int id;
    struct Queue* tasks;
    int runFlag;
    int next;
    int num_stream;
    CUstream *stream;
} CUDAWorker;

struct LoopCounter {
    int i;
} LoopCounter;

struct Array {
    int size;
    int index;
    int prefix;
    int* array;
} Array;
*/

/*
 * Prepare a CUDA worker for use.
 *
 * cudaWorker - worker whose num_stream, device and cuCtx fields are filled in.
 * phase      - when 0, the driver is initialised, device 0 is looked up and a
 *              context is created on it; for any other value only num_stream
 *              is set.
 *
 * Always prints a completion message to stdout.
 */
void cudaInit(struct CUDAWorker *cudaWorker,int phase) {
    // Number of streams; stream allocation/creation itself is currently disabled:
    //    cudaWorker->stream = NEWN(cudaWorker->num_stream, CUstream );
    //    for (int i=0;i<cudaWorker->num_stream;i++)
    //        checkCudaErrors(cuStreamCreate(&cudaWorker->stream[i],0));
    cudaWorker->num_stream = 1;

    if (phase == 0) {
        // Driver bring-up: init, select device 0, create a spin-wait context.
        checkCudaErrors(cuInit(0));
        checkCudaErrors(cuDeviceGet(&cudaWorker->device, 0));
        checkCudaErrors(cuCtxCreate(&cudaWorker->cuCtx, CU_CTX_SCHED_SPIN, cudaWorker->device));
    }

    printf("cuda Init: Done\n");
}


/*
 * Run the "bitonicSwap" kernel once on the GPU over inputSortArray->array.
 *
 * context         - task context. Its module/function fields are used as
 *                   scratch for the loaded kernel (cleared again before
 *                   returning), num_exec is set to 1, and iterate/iterator
 *                   select the grid dimensions of the launch.
 * inputSortArray  - supplies the host array plus the block, first and prefix
 *                   integers that are copied to the device as kernel arguments.
 *                   The array is overwritten in place with the kernel's result.
 * outputSortArray - its array field is set to the same host array pointer as
 *                   inputSortArray->array.
 *
 * Every driver call is wrapped in checkCudaErrors (defined in helper_cuda.h;
 * presumably it reports the error and aborts - confirm there).
 */
void CUDAExec(struct Context* context, struct SortArray* inputSortArray, struct SortArray* outputSortArray) {
    //printf("cuda exec start\n");
    // Worker *worker = context->worker;
    // CUDAWorker* cudaWorker = (CUDAWorker*)worker->worker;

    // Byte size of the element buffer; computed once and reused for the
    // allocation and both copies so the three can never disagree.
    const size_t arrayBytes = sizeof(struct Integer)*GET_SIZE(inputSortArray->array);

    // Device buffers: devA holds the array, devB/devC/devD one int each.
    CUdeviceptr devA;
    CUdeviceptr devB;
    CUdeviceptr devC;
    CUdeviceptr devD;

    checkCudaErrors(cuMemAlloc(&devA, arrayBytes));
    checkCudaErrors(cuMemAlloc(&devB, sizeof(int)));
    checkCudaErrors(cuMemAlloc(&devC, sizeof(int)));
    checkCudaErrors(cuMemAlloc(&devD, sizeof(int)));

    // Load the PTX module and look up the kernel. This happens on every call,
    // so the module is unloaded again at the end of this function.
    checkCudaErrors(cuModuleLoad(&context->module, "c/examples/bitonicSort/CUDAbitonicSwap.ptx"));
    checkCudaErrors(cuModuleGetFunction(&context->function, context->module, "bitonicSwap"));

    // Send the input DataGear to the GPU.
    // Synchronous data transfer (host to device)
    checkCudaErrors(cuMemcpyHtoD(devA, inputSortArray->array, arrayBytes));
    checkCudaErrors(cuMemcpyHtoD(devB, &inputSortArray->block, sizeof(int)));
    checkCudaErrors(cuMemcpyHtoD(devC, &inputSortArray->first, sizeof(int)));
    checkCudaErrors(cuMemcpyHtoD(devD, &inputSortArray->prefix, sizeof(int)));

    // Asynchronous kernel launch on the default stream.
    context->num_exec = 1;
    void* args[] = {&devA, &devB, &devC, &devD};
    if (context->iterate) {
        // Grid dimensions come from the task's multi-dimensional iterator;
        // each block runs a single thread.
        struct MultiDimIterator* iterator = &context->iterator->iterator->MultiDimIterator;
        checkCudaErrors(cuLaunchKernel(context->function,
                    iterator->x, iterator->y, iterator->z,
                    1, 1, 1,
                    0, NULL, args, NULL));

    } else {
        // No iterator: one block of one thread.
        checkCudaErrors(cuLaunchKernel(context->function,
                    1, 1, 1,
                    1, 1, 1,
                    0, NULL, args, NULL));
    }

    // Wait for the kernel to finish so that any execution fault is reported
    // here rather than by the copy below.
    checkCudaErrors(cuCtxSynchronize());

    // Fetch the result back into the caller's host array.
    checkCudaErrors(cuMemcpyDtoH(inputSortArray->array, devA, arrayBytes));
    outputSortArray->array = inputSortArray->array;

    // Release device buffers; failures are checked like every other driver call.
    checkCudaErrors(cuMemFree(devA));
    checkCudaErrors(cuMemFree(devB));
    checkCudaErrors(cuMemFree(devC));
    checkCudaErrors(cuMemFree(devD));

    // Unload the module loaded above; otherwise each call would leave one more
    // module resident in the context. Clear the handles so nothing can use
    // them after the unload.
    checkCudaErrors(cuModuleUnload(context->module));
    context->function = NULL;
    context->module = NULL;
}

void cudaShutdown( struct CUDAWorker *worker) {
    //    for (int i=0;i<worker->num_stream;i++)
    //        checkCudaErrors(cuStreamDestroy(worker->stream[i]));
    checkCudaErrors(cuCtxDestroy(worker->cuCtx));
}