Mercurial > hg > Papers > 2017 > ikkun-osc
changeset 9:556456198f52
fix
author | ikkun |
---|---|
date | Fri, 16 Jun 2017 05:31:42 +0900 |
parents | df645e67040a |
children | 33cbafd41036 |
files | osc.md sample/Makefile sample/cmake_install.cmake sample/hello sample2/matrixMul.cu |
diffstat | 5 files changed, 468 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
--- a/osc.md Fri Jun 16 01:06:22 2017 +0900 +++ b/osc.md Fri Jun 16 05:31:42 2017 +0900 @@ -77,6 +77,11 @@ cmake_minimum_required(VERSION 3.7) - 必要なCUDAのパッケージを設定します。 find_package(CUDA REQUIRED) +- 必要なCUDAライブラリのpathを設定します。 +include_directories(/Developer/NVIDIA/CUDA-8.0/samples/common/inc) +- target名とソースコードを以下のように指定する +cuda_add_executable(matrixMul matrixMul.cu) +- cmake .でMakefileを作成し、make matrixMul.cuでビルドします # Ninja
--- a/sample/Makefile Fri Jun 16 01:06:22 2017 +0900 +++ b/sample/Makefile Fri Jun 16 05:31:42 2017 +0900 @@ -1,5 +1,5 @@ # CMAKE generated file: DO NOT EDIT! -# Generated by "Unix Makefiles" Generator, CMake Version 3.8 +# Generated by "Unix Makefiles" Generator, CMake Version 3.7 # Default target executed when no arguments are given to make. default_target: all @@ -39,19 +39,19 @@ SHELL = /bin/sh # The CMake executable. -CMAKE_COMMAND = /usr/local/Cellar/cmake/3.8.2/bin/cmake +CMAKE_COMMAND = /usr/local/Cellar/cmake/3.7.2/bin/cmake # The command to remove a file. -RM = /usr/local/Cellar/cmake/3.8.2/bin/cmake -E remove -f +RM = /usr/local/Cellar/cmake/3.7.2/bin/cmake -E remove -f # Escaping for special characters. EQUALS = = # The top-level source directory on which CMake was run. -CMAKE_SOURCE_DIR = /Users/e135704/seminar/osc/ikkun-osc/sample +CMAKE_SOURCE_DIR = /Users/one/hg/Papers/2017/ikkun-osc/sample # The top-level build directory on which CMake was run. -CMAKE_BINARY_DIR = /Users/e135704/seminar/osc/ikkun-osc/sample +CMAKE_BINARY_DIR = /Users/one/hg/Papers/2017/ikkun-osc/sample #============================================================================= # Targets provided globally by CMake. @@ -59,7 +59,7 @@ # Special rule for the target rebuild_cache rebuild_cache: @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..." - /usr/local/Cellar/cmake/3.8.2/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) + /usr/local/Cellar/cmake/3.7.2/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) .PHONY : rebuild_cache # Special rule for the target rebuild_cache @@ -70,7 +70,7 @@ # Special rule for the target edit_cache edit_cache: @$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..." - /usr/local/Cellar/cmake/3.8.2/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) + /usr/local/Cellar/cmake/3.7.2/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) .PHONY : edit_cache # Special rule for the target edit_cache @@ -80,9 +80,9 @@ # The main all target all: cmake_check_build_system - $(CMAKE_COMMAND) -E cmake_progress_start /Users/e135704/seminar/osc/ikkun-osc/sample/CMakeFiles /Users/e135704/seminar/osc/ikkun-osc/sample/CMakeFiles/progress.marks + $(CMAKE_COMMAND) -E cmake_progress_start /Users/one/hg/Papers/2017/ikkun-osc/sample/CMakeFiles /Users/one/hg/Papers/2017/ikkun-osc/sample/CMakeFiles/progress.marks $(MAKE) -f CMakeFiles/Makefile2 all - $(CMAKE_COMMAND) -E cmake_progress_start /Users/e135704/seminar/osc/ikkun-osc/sample/CMakeFiles 0 + $(CMAKE_COMMAND) -E cmake_progress_start /Users/one/hg/Papers/2017/ikkun-osc/sample/CMakeFiles 0 .PHONY : all # The main clean target
--- a/sample/cmake_install.cmake Fri Jun 16 01:06:22 2017 +0900 +++ b/sample/cmake_install.cmake Fri Jun 16 05:31:42 2017 +0900 @@ -1,4 +1,4 @@ -# Install script for directory: /Users/e135704/seminar/osc/ikkun-osc/sample +# Install script for directory: /Users/one/hg/Papers/2017/ikkun-osc/sample # Set the install prefix if(NOT DEFINED CMAKE_INSTALL_PREFIX) @@ -35,5 +35,5 @@ string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT "${CMAKE_INSTALL_MANIFEST_FILES}") -file(WRITE "/Users/e135704/seminar/osc/ikkun-osc/sample/${CMAKE_INSTALL_MANIFEST}" +file(WRITE "/Users/one/hg/Papers/2017/ikkun-osc/sample/${CMAKE_INSTALL_MANIFEST}" "${CMAKE_INSTALL_MANIFEST_CONTENT}")
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sample2/matrixMul.cu Fri Jun 16 05:31:42 2017 +0900 @@ -0,0 +1,452 @@ +/** + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +/** + * Matrix multiplication: C = A * B. + * Host code. + * + * This sample implements matrix multiplication as described in Chapter 3 + * of the programming guide. + * It has been written for clarity of exposition to illustrate various CUDA + * programming principles, not with the goal of providing the most + * performant generic kernel for matrix multiplication. + * + * See also: + * V. Volkov and J. Demmel, "Benchmarking GPUs to tune dense linear algebra," + * in Proc. 2008 ACM/IEEE Conf. on Supercomputing (SC '08), + * Piscataway, NJ: IEEE Press, 2008, pp. Art. 31:1-11. + */ + +// System includes +#include <stdio.h> +#include <assert.h> + +// CUDA runtime +#include <cuda_runtime.h> + +// Helper functions and utilities to work with CUDA +#include <helper_functions.h> +#include <helper_cuda.h> + +/** + * Matrix multiplication (CUDA Kernel) on the device: C = A * B + * wA is A's width and wB is B's width + */ +template <int BLOCK_SIZE> __global__ void +matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) +{ + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + // Index of the first sub-matrix of A processed by the block + int aBegin = wA * BLOCK_SIZE * by; + + // Index of the last sub-matrix of A processed by the block + int aEnd = aBegin + wA - 1; + + // Step size used to iterate through the sub-matrices of A + int aStep = BLOCK_SIZE; + + // Index of the first sub-matrix of B processed by the block + int bBegin = BLOCK_SIZE * bx; + + // Step size used to iterate through the sub-matrices of B + int bStep = BLOCK_SIZE * wB; + + // Csub is used to store the element of the block sub-matrix + // that is computed by the thread + float Csub = 0; + + // Loop over all the sub-matrices of A and B + // required to compute the block sub-matrix + for (int a = aBegin, b = bBegin; + a <= aEnd; + a += aStep, b += bStep) + { + + // Declaration of the shared memory array As used to + // store the sub-matrix of A + __shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; + + // Declaration of the shared memory array Bs used to + // store the sub-matrix of B + __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; + + // Load the matrices from device memory + // to shared memory; each thread loads + // one element of each matrix + As[ty][tx] = A[a + wA * ty + tx]; + Bs[ty][tx] = B[b + wB * ty + tx]; + + // Synchronize to make sure the matrices are loaded + __syncthreads(); + + // Multiply the two matrices together; + // each thread computes one element + // of the block sub-matrix +#pragma unroll + + for (int k = 0; k < BLOCK_SIZE; ++k) + { + Csub += As[ty][k] * Bs[k][tx]; + } + + // Synchronize to make sure that the preceding + // computation is done before loading two new + // sub-matrices of A and B in the next iteration + __syncthreads(); + } + + // Write the block sub-matrix to device memory; + // each thread writes one element + int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx; + C[c + wB * ty + tx] = Csub; +} + +void constantInit(float *data, int size, float val) +{ + for (int i = 0; i < size; ++i) + { + data[i] = val; + } +} + +/** + * Run a simple test of matrix multiplication using CUDA + */ +int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB) +{ + // Allocate host memory for matrices A and B + unsigned int size_A = dimsA.x * dimsA.y; + unsigned int mem_size_A = sizeof(float) * size_A; + float *h_A = (float *)malloc(mem_size_A); + unsigned int size_B = dimsB.x * dimsB.y; + unsigned int mem_size_B = sizeof(float) * size_B; + float *h_B = (float *)malloc(mem_size_B); + + // Initialize host memory + const float valB = 0.01f; + constantInit(h_A, size_A, 1.0f); + constantInit(h_B, size_B, valB); + + // Allocate device memory + float *d_A, *d_B, *d_C; + + // Allocate host matrix C + dim3 dimsC(dimsB.x, dimsA.y, 1); + unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); + float *h_C = (float *) malloc(mem_size_C); + + if (h_C == NULL) + { + fprintf(stderr, "Failed to allocate host matrix C!\n"); + exit(EXIT_FAILURE); + } + + cudaError_t error; + + error = cudaMalloc((void **) &d_A, mem_size_A); + + if (error != cudaSuccess) + { + printf("cudaMalloc d_A returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + exit(EXIT_FAILURE); + } + + error = cudaMalloc((void **) &d_B, mem_size_B); + + if (error != cudaSuccess) + { + printf("cudaMalloc d_B returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + exit(EXIT_FAILURE); + } + + error = cudaMalloc((void **) &d_C, mem_size_C); + + if (error != cudaSuccess) + { + printf("cudaMalloc d_C returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + exit(EXIT_FAILURE); + } + + // copy host memory to device + error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); + + if (error != cudaSuccess) + { + printf("cudaMemcpy (d_A,h_A) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + exit(EXIT_FAILURE); + } + + error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); + + if (error != cudaSuccess) + { + printf("cudaMemcpy (d_B,h_B) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + exit(EXIT_FAILURE); + } + + // Setup execution parameters + dim3 threads(block_size, block_size); + dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y); + + // Create and start timer + printf("Computing result using CUDA Kernel...\n"); + + // Performs warmup operation using matrixMul CUDA kernel + if (block_size == 16) + { + matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + } + else + { + matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + } + + printf("done\n"); + + cudaDeviceSynchronize(); + + // Allocate CUDA events that we'll use for timing + cudaEvent_t start; + error = cudaEventCreate(&start); + + if (error != cudaSuccess) + { + fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + cudaEvent_t stop; + error = cudaEventCreate(&stop); + + if (error != cudaSuccess) + { + fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + // Record the start event + error = cudaEventRecord(start, NULL); + + if (error != cudaSuccess) + { + fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + // Execute the kernel + int nIter = 300; + + for (int j = 0; j < nIter; j++) + { + if (block_size == 16) + { + matrixMulCUDA<16><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + } + else + { + matrixMulCUDA<32><<< grid, threads >>>(d_C, d_A, d_B, dimsA.x, dimsB.x); + } + } + + // Record the stop event + error = cudaEventRecord(stop, NULL); + + if (error != cudaSuccess) + { + fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + // Wait for the stop event to complete + error = cudaEventSynchronize(stop); + + if (error != cudaSuccess) + { + fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + float msecTotal = 0.0f; + error = cudaEventElapsedTime(&msecTotal, start, stop); + + if (error != cudaSuccess) + { + fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + + // Compute and print the performance + float msecPerMatrixMul = msecTotal / nIter; + double flopsPerMatrixMul = 2.0 * (double)dimsA.x * (double)dimsA.y * (double)dimsB.x; + double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); + printf( + "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops, WorkgroupSize= %u threads/block\n", + gigaFlops, + msecPerMatrixMul, + flopsPerMatrixMul, + threads.x * threads.y); + + // Copy result from device to host + error = cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); + + if (error != cudaSuccess) + { + printf("cudaMemcpy (h_C,d_C) returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + exit(EXIT_FAILURE); + } + + printf("Checking computed result for correctness: "); + bool correct = true; + + // test relative error by the formula + // |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps + double eps = 1.e-6 ; // machine zero + + for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) + { + double abs_err = fabs(h_C[i] - (dimsA.x * valB)); + double dot_length = dimsA.x; + double abs_val = fabs(h_C[i]); + double rel_err = abs_err/abs_val/dot_length ; + + if (rel_err > eps) + { + printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x*valB, eps); + correct = false; + } + } + + printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); + + // Clean up memory + free(h_A); + free(h_B); + free(h_C); + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); + + if (correct) + { + return EXIT_SUCCESS; + } + else + { + return EXIT_FAILURE; + } +} + + +/** + * Program main + */ +int main(int argc, char **argv) +{ + printf("[Matrix Multiply Using CUDA] - Starting...\n"); + + if (checkCmdLineFlag(argc, (const char **)argv, "help") || + checkCmdLineFlag(argc, (const char **)argv, "?")) + { + printf("Usage -device=n (n >= 0 for deviceID)\n"); + printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); + printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); + printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n"); + + exit(EXIT_SUCCESS); + } + + // By default, we use device 0, otherwise we override the device ID based on what is provided at the command line + int devID = 0; + + if (checkCmdLineFlag(argc, (const char **)argv, "device")) + { + devID = getCmdLineArgumentInt(argc, (const char **)argv, "device"); + cudaSetDevice(devID); + } + + cudaError_t error; + cudaDeviceProp deviceProp; + error = cudaGetDevice(&devID); + + if (error != cudaSuccess) + { + printf("cudaGetDevice returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + } + + error = cudaGetDeviceProperties(&deviceProp, devID); + + if (deviceProp.computeMode == cudaComputeModeProhibited) + { + fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n"); + exit(EXIT_SUCCESS); + } + + if (error != cudaSuccess) + { + printf("cudaGetDeviceProperties returned error %s (code %d), line(%d)\n", cudaGetErrorString(error), error, __LINE__); + } + else + { + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + // Use a larger block size for Fermi and above + int block_size = (deviceProp.major < 2) ? 16 : 32; + + dim3 dimsA(5*2*block_size, 5*2*block_size, 1); + dim3 dimsB(5*4*block_size, 5*2*block_size, 1); + + // width of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "wA")) + { + dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); + } + + // height of Matrix A + if (checkCmdLineFlag(argc, (const char **)argv, "hA")) + { + dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); + } + + // width of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "wB")) + { + dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); + } + + // height of Matrix B + if (checkCmdLineFlag(argc, (const char **)argv, "hB")) + { + dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); + } + + if (dimsA.x != dimsB.y) + { + printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", + dimsA.x, dimsB.y); + exit(EXIT_FAILURE); + } + + printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y); + + int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); + + exit(matrix_result); +}