view src/parallel_execution/CUDAtwice.cbc @ 303:1dbaef86593b

CUDAtwice.cbc
author ikkun
date Mon, 13 Feb 2017 18:23:29 +0900
parents 8e7926f3e271
children ae4f6aa427f5
line wrap: on
line source

#include <stdio.h>

#include "context.h"
#include "origin_cs.h"

/*
 * twice: code gear that doubles one element of array per invocation.
 *
 * The element handled is array[loopCounter->i + index*prefix]. While
 * loopCounter->i is below prefix, the element is doubled, the counter is
 * advanced, and control continues through meta(context, C_twice). Once the
 * counter reaches prefix, it is reset to 0 and control continues to
 * workerContext->next through meta.
 */
__code twice(struct Context* context, struct LoopCounter* loopCounter, int index, int prefix, int* array, struct Context* workerContext) {
    int step = loopCounter->i;

    // Termination case first: nothing left to process in this slice.
    if (!(step < prefix)) {
        loopCounter->i = 0;
        goto meta(workerContext, workerContext->next);
    }

    // Position of the current element inside the slice selected by index.
    int offset = step + index*prefix;
    array[offset] *= 2;
    loopCounter->i++;

    goto meta(context, C_twice);
}

/*
 * twice_stub: stub code gear that runs the "multiply" kernel from
 * multiply.ptx through the CUDA driver API, then continues to twice.
 *
 * Sequence: allocate device buffers, load the module and look up the kernel,
 * copy A to the device, launch num_exec kernels, copy every devOut[i] back
 * into result[i], wait for all streams, free the device buffers, and finally
 * goto twice with index 0, prefix 0 and a NULL array.
 *
 * This block relies on names that are not declared in it: num_exec,
 * num_stream, stream, cur, module, function, A, result, LENGTH, THREAD and
 * checkCudaErrors.
 * NOTE(review): presumably these are globals or macros from the included
 * headers -- confirm.
 */
__code twice_stub(struct Context* context) {
    struct Context* workerContext = context->worker->worker->CPUWorker.context;

    // Device buffers: one shared input (devA) plus one scalar-sized buffer
    // (devB[i]) and one output buffer (devOut[i]) per kernel launch.
    CUdeviceptr devA;
    CUdeviceptr devB[num_exec];
    CUdeviceptr devOut[num_exec];
    // Index into stream[] used by launch i, so the copy-back of devOut[i] can
    // be queued on the same stream as the kernel that writes it.
    int launchStream[num_exec];

    checkCudaErrors(cuMemAlloc(&devA, LENGTH*THREAD*sizeof(float)));
    for (int i=0;i<num_exec;i++) {
        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
        checkCudaErrors(cuMemAlloc(&devOut[i], LENGTH*THREAD*sizeof(float)));
    }

    // Load the kernel module and look up the kernel entry point.
    // NOTE(review): the original comment said "load it if the twice kernel is
    // not defined yet", but the module is loaded unconditionally on every
    // call and never unloaded here -- confirm whether it should be cached.
    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));

    // Send the input data to the GPU.
    // Synchronous data transfer (host to device)
    checkCudaErrors(cuMemcpyHtoD(devA, A, LENGTH*THREAD*sizeof(float)));

    // Asynchronous kernel launches, round-robin over stream[] via cur.
    for (int i=0;i<num_exec;i++,cur++) {
        if (num_stream <= cur)
            cur=0;
        launchStream[i] = cur;
        // NOTE(review): devB[i] is never written before the launch because
        // the upload of B[i] below is disabled -- confirm the kernel does not
        // depend on its contents.
        //B[i] = (float)(i+1);
        //cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float), stream[cur]);
        void* args[] = {&devA, &devB[i], &devOut[i]};
        checkCudaErrors(cuLaunchKernel(function,
                       LENGTH, 1, 1,
                       THREAD, 1, 1,
                                 0, num_stream ? stream[cur] : NULL , args, NULL));
    }

    // Queue the commands that fetch the results.
    // Data transfer (device to host). With streams, each copy goes on the
    // stream its kernel was launched on; otherwise a blocking copy is used.
    for (int i=0;i<num_exec;i++) {
        if (num_stream) {
            checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], LENGTH*THREAD*sizeof(float), stream[launchStream[i]]));
        } else {
            checkCudaErrors(cuMemcpyDtoH(result[i], devOut[i], LENGTH*THREAD*sizeof(float)));
        }
    }

    // Wait for the queued commands on every stream to finish.
    for (int i=0;i<num_stream;i++)
        checkCudaErrors(cuStreamSynchronize(stream[i]));

    // The device handles are local to this code gear, so release them here;
    // the results have already been copied into result[].
    checkCudaErrors(cuMemFree(devA));
    for (int i=0;i<num_exec;i++) {
        checkCudaErrors(cuMemFree(devB[i]));
        checkCudaErrors(cuMemFree(devOut[i]));
    }

    // Follow the continuation back to the worker: twice is entered with
    // prefix 0 and a NULL array.
    goto twice(context, Gearef(context, LoopCounter), 0, 0, NULL, workerContext);
}