comparison src/test/twice.cu @ 292:2bc63a22dd21

add twice
author ikkun
date Thu, 09 Feb 2017 19:51:32 +0900
parents src/test/main.cc@87128b876c63
children
comparison
equal deleted inserted replaced
291:87128b876c63 292:2bc63a22dd21
1 #include <stdio.h>
2 #include <sys/time.h>
3 #include <string.h>
4 #include <stdlib.h>
5
6 #include <cuda.h>
7
8 #include <cuda_runtime.h>
9 #include "helper_cuda.h"
10
/*
 * Launch geometry shared by the whole test.
 *   LENGTH - value passed as the grid x-dimension of each kernel launch.
 *   THREAD - value passed as the block x-dimension of each kernel launch.
 * LENGTH*THREAD is also the element count of every vector buffer.
 */
#define LENGTH  (10)
#define THREAD  (10)
13
/*
 * Return the current time in seconds as a double.
 *
 * The value is read with gettimeofday(); the microsecond field is scaled
 * by 1e-6 and added to the whole-second field.
 */
double
getTime() {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double micro_part = (double)now.tv_usec * 1e-6;
    return now.tv_sec + micro_part;
}
20
/*
 * Host-side verification of one kernel result.
 *
 * Compares C[i] against A[i]*B for the first LENGTH*THREAD elements using
 * exact floating-point equality. Prints "multiply failure." as soon as one
 * element differs, or "success." when every element matches.
 *
 *   A - input vector (host memory)
 *   B - scalar multiplier
 *   C - result vector to verify (host memory)
 */
void
check_data(float* A, float B, float* C) {
    const int total = LENGTH*THREAD;

    // Advance past every element that matches; stop at the first mismatch.
    int idx = 0;
    while (idx < total && A[idx]*B == C[idx])
        ++idx;

    if (idx < total)
        puts("multiply failure.");
    else
        puts("success.");
}
31
/*
 * Print the first LENGTH*THREAD elements of C to stdout, one value per
 * line, formatted with "%f".
 */
void print_result(float* C) {
    const float* const last = C + LENGTH*THREAD;
    for (const float* cursor = C; cursor != last; ++cursor)
        printf("%f\n", *cursor);
}
37
/*
 * Scan the command line for "--stream N" / "-s N".
 *
 *   argc, argv - the arguments received by main()
 *   fallback   - value returned when the option is not present
 *
 * Returns the requested stream count (the last occurrence wins), or -1
 * after printing a diagnostic to stderr when the option has no value or
 * the value is not a positive integer.
 */
static int
parse_num_stream(int argc, char* argv[], int fallback) {
    int num_stream = fallback;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--stream") != 0 && strcmp(argv[i], "-s") != 0)
            continue;

        // The option must be followed by a value; never read past argv.
        if (i + 1 >= argc) {
            fprintf(stderr, "%s: missing stream count\n", argv[i]);
            return -1;
        }

        num_stream = atoi(argv[++i]);
        // atoi() yields 0 for non-numeric text; at least one stream is
        // required because every operation below is issued on a stream.
        if (num_stream <= 0) {
            fprintf(stderr, "invalid stream count: %s\n", argv[i]);
            return -1;
        }
    }
    return num_stream;
}

/*
 * Load the "multiply" kernel from multiply.ptx, run it num_exec times
 * distributed round-robin over the requested number of streams, and check
 * every result on the host.
 *
 * Options:
 *   --stream N, -s N   number of CUDA streams to use (default 1, N >= 1)
 *
 * Returns 0 after all work completed, 1 on a command-line error. CUDA
 * driver errors are handled by checkCudaErrors() from helper_cuda.h.
 */
int main(int args, char* argv[]) {
    const int    num_exec  = 16;                        // number of executed kernels
    const int    num_elem  = LENGTH*THREAD;             // elements per vector
    const size_t vec_bytes = num_elem*sizeof(float);    // bytes per vector

    const int num_stream = parse_num_stream(args, argv, 1);  // number of streams
    if (num_stream <= 0) {
        fprintf(stderr, "usage: %s [--stream|-s <positive integer>]\n",
                args > 0 ? argv[0] : "twice");
        return 1;
    }

    // initialize and load kernel
    CUdevice   device;
    CUcontext  context;
    CUmodule   module;
    CUfunction function;
    // Heap allocation instead of a variable-length array (not standard C++).
    CUstream*  stream = new CUstream[num_stream];

    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&device, 0));
    checkCudaErrors(cuCtxCreate(&context, CU_CTX_SCHED_SPIN, device));
    checkCudaErrors(cuModuleLoad(&module, "multiply.ptx"));
    checkCudaErrors(cuModuleGetFunction(&function, module, "multiply"));
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamCreate(&stream[i], 0));

    // device memory: one shared input vector, plus one scalar and one
    // output vector per kernel execution
    CUdeviceptr devA;
    CUdeviceptr devB[num_exec];
    CUdeviceptr devOut[num_exec];

    checkCudaErrors(cuMemAlloc(&devA, vec_bytes));
    for (int i = 0; i < num_exec; i++) {
        checkCudaErrors(cuMemAlloc(&devB[i], sizeof(float)));
        checkCudaErrors(cuMemAlloc(&devOut[i], vec_bytes));
    }

    // input buffers
    float* A = new float[num_elem];
    float* B = new float[num_exec];

    for (int i = 0; i < num_elem; i++)
        A[i] = (float)(i+1000);

    // output buffers, one per kernel execution
    float** result = new float* [num_exec];

    for (int i = 0; i < num_exec; i++)
        result[i] = new float[num_elem];

    // Synchronous data transfer (host to device)
    checkCudaErrors(cuMemcpyHtoD(devA, A, vec_bytes));

    // Execution i always uses stream i % num_stream for its copy-in, its
    // kernel and its copy-out, so the three steps of one execution are
    // ordered by the stream they share.
    // NOTE(review): A, B and result[] are ordinary new[] allocations, not
    // page-locked memory; the *Async copies may therefore not overlap with
    // other work - confirm against the driver API documentation if overlap
    // is what this test is meant to measure.

    // Asynchronous data transfer (host to device)
    for (int i = 0; i < num_exec; i++) {
        B[i] = (float)(i+1);
        checkCudaErrors(cuMemcpyHtoDAsync(devB[i], &B[i], sizeof(float),
                                          stream[i % num_stream]));
    }

    // Asynchronous kernel launch: LENGTH blocks of THREAD threads each.
    // B[] is intentionally not written again here; it already holds the
    // values queued for transfer above.
    for (int i = 0; i < num_exec; i++) {
        void* kernel_params[] = {&devA, &devB[i], &devOut[i]};
        checkCudaErrors(cuLaunchKernel(function,
                                       LENGTH, 1, 1,
                                       THREAD, 1, 1,
                                       0, stream[i % num_stream],
                                       kernel_params, NULL));
    }

    // Asynchronous data transfer (device to host)
    for (int i = 0; i < num_exec; i++) {
        checkCudaErrors(cuMemcpyDtoHAsync(result[i], devOut[i], vec_bytes,
                                          stream[i % num_stream]));
    }

    // wait for every stream before the host reads the results
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamSynchronize(stream[i]));

    // execution i is expected to have multiplied A by (i+1)
    for (int i = 0; i < num_exec; i++)
        check_data(A, (float)(i+1), result[i]);

    // memory release
    checkCudaErrors(cuMemFree(devA));
    for (int i = 0; i < num_exec; i++) {
        checkCudaErrors(cuMemFree(devB[i]));
        checkCudaErrors(cuMemFree(devOut[i]));
    }
    for (int i = 0; i < num_stream; i++)
        checkCudaErrors(cuStreamDestroy(stream[i]));
    checkCudaErrors(cuModuleUnload(module));
    checkCudaErrors(cuCtxDestroy(context));

    delete[] A;
    delete[] B;
    for (int i = 0; i < num_exec; i++)
        delete[] result[i];
    delete[] result;
    delete[] stream;

    return 0;
}
154 //