CbC/CbC_llvm: clang/lib/Headers/__clang_cuda_runtime

annotate clang/lib/Headers/__clang_cuda_runtime_wrapper.h @ 176:de4ac79aef9d

...

author	Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date	Mon, 25 May 2020 17:13:11 +0900
parents	0572611fdcc8
children	2e18cbf3894f

rev	line source
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	1 /*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
1d019706d866 LLVM10 anatofuz parents: diff changeset	2 *
1d019706d866 LLVM10 anatofuz parents: diff changeset	3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
1d019706d866 LLVM10 anatofuz parents: diff changeset	4 * See https://llvm.org/LICENSE.txt for license information.
1d019706d866 LLVM10 anatofuz parents: diff changeset	5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
1d019706d866 LLVM10 anatofuz parents: diff changeset	6 *
1d019706d866 LLVM10 anatofuz parents: diff changeset	7 *===-----------------------------------------------------------------------===
1d019706d866 LLVM10 anatofuz parents: diff changeset	8 */
1d019706d866 LLVM10 anatofuz parents: diff changeset	9
1d019706d866 LLVM10 anatofuz parents: diff changeset	10 /*
1d019706d866 LLVM10 anatofuz parents: diff changeset	11 * WARNING: This header is intended to be directly -include'd by
1d019706d866 LLVM10 anatofuz parents: diff changeset	12 * the compiler and is not supposed to be included by users.
1d019706d866 LLVM10 anatofuz parents: diff changeset	13 *
1d019706d866 LLVM10 anatofuz parents: diff changeset	14 * CUDA headers are implemented in a way that currently makes it
1d019706d866 LLVM10 anatofuz parents: diff changeset	15 * impossible for user code to #include directly when compiling with
1d019706d866 LLVM10 anatofuz parents: diff changeset	16 * Clang. They present a different view of CUDA-supplied functions
1d019706d866 LLVM10 anatofuz parents: diff changeset	17 * depending on where in NVCC's compilation pipeline the headers are
1d019706d866 LLVM10 anatofuz parents: diff changeset	18 * included. Neither of these modes provides function definitions with
1d019706d866 LLVM10 anatofuz parents: diff changeset	19 * correct attributes, so we use preprocessor to force the headers
1d019706d866 LLVM10 anatofuz parents: diff changeset	20 * into a form that Clang can use.
1d019706d866 LLVM10 anatofuz parents: diff changeset	21 *
1d019706d866 LLVM10 anatofuz parents: diff changeset	22 * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
1d019706d866 LLVM10 anatofuz parents: diff changeset	23 * this file during every CUDA compilation.
1d019706d866 LLVM10 anatofuz parents: diff changeset	24 */
1d019706d866 LLVM10 anatofuz parents: diff changeset	25
1d019706d866 LLVM10 anatofuz parents: diff changeset	26 #ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	27 #define __CLANG_CUDA_RUNTIME_WRAPPER_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	28
1d019706d866 LLVM10 anatofuz parents: diff changeset	29 #if defined(__CUDA__) && defined(__clang__)
1d019706d866 LLVM10 anatofuz parents: diff changeset	30
1d019706d866 LLVM10 anatofuz parents: diff changeset	31 // Include some forward declares that must come before cmath.
1d019706d866 LLVM10 anatofuz parents: diff changeset	32 #include <__clang_cuda_math_forward_declares.h>
1d019706d866 LLVM10 anatofuz parents: diff changeset	33
173 0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	34 // Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	35 // enabled depend on it to avoid using __float128, which is unsupported in
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	36 // CUDA.
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	37 #define __CUDACC__
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	38
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	39 // Include some standard headers to avoid CUDA headers including them
1d019706d866 LLVM10 anatofuz parents: diff changeset	40 // while some required macros (like __THROW) are in a weird state.
1d019706d866 LLVM10 anatofuz parents: diff changeset	41 #include <cmath>
1d019706d866 LLVM10 anatofuz parents: diff changeset	42 #include <cstdlib>
1d019706d866 LLVM10 anatofuz parents: diff changeset	43 #include <stdlib.h>
173 0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	44 #undef __CUDACC__
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	45
1d019706d866 LLVM10 anatofuz parents: diff changeset	46 // Preserve common macros that will be changed below by us or by CUDA
1d019706d866 LLVM10 anatofuz parents: diff changeset	47 // headers.
1d019706d866 LLVM10 anatofuz parents: diff changeset	48 #pragma push_macro("__THROW")
1d019706d866 LLVM10 anatofuz parents: diff changeset	49 #pragma push_macro("__CUDA_ARCH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	50
1d019706d866 LLVM10 anatofuz parents: diff changeset	51 // WARNING: Preprocessor hacks below are based on specific details of
1d019706d866 LLVM10 anatofuz parents: diff changeset	52 // CUDA-7.x headers and are not expected to work with any other
1d019706d866 LLVM10 anatofuz parents: diff changeset	53 // version of CUDA headers.
1d019706d866 LLVM10 anatofuz parents: diff changeset	54 #include "cuda.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	55 #if !defined(CUDA_VERSION)
1d019706d866 LLVM10 anatofuz parents: diff changeset	56 #error "cuda.h did not define CUDA_VERSION"
1d019706d866 LLVM10 anatofuz parents: diff changeset	57 #elif CUDA_VERSION < 7000
1d019706d866 LLVM10 anatofuz parents: diff changeset	58 #error "Unsupported CUDA version!"
1d019706d866 LLVM10 anatofuz parents: diff changeset	59 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	60
1d019706d866 LLVM10 anatofuz parents: diff changeset	61 #pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	62 #if CUDA_VERSION >= 10000
1d019706d866 LLVM10 anatofuz parents: diff changeset	63 #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
1d019706d866 LLVM10 anatofuz parents: diff changeset	64 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	65
1d019706d866 LLVM10 anatofuz parents: diff changeset	66 // Make largest subset of device functions available during host
1d019706d866 LLVM10 anatofuz parents: diff changeset	67 // compilation -- SM_35 for the time being.
1d019706d866 LLVM10 anatofuz parents: diff changeset	68 #ifndef __CUDA_ARCH__
1d019706d866 LLVM10 anatofuz parents: diff changeset	69 #define __CUDA_ARCH__ 350
1d019706d866 LLVM10 anatofuz parents: diff changeset	70 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	71
1d019706d866 LLVM10 anatofuz parents: diff changeset	72 #include "__clang_cuda_builtin_vars.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	73
1d019706d866 LLVM10 anatofuz parents: diff changeset	74 // No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
1d019706d866 LLVM10 anatofuz parents: diff changeset	75 // has taken care of builtin variables declared in the file.
1d019706d866 LLVM10 anatofuz parents: diff changeset	76 #define __DEVICE_LAUNCH_PARAMETERS_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	77
1d019706d866 LLVM10 anatofuz parents: diff changeset	78 // {math,device}_functions.h only have declarations of the
1d019706d866 LLVM10 anatofuz parents: diff changeset	79 // functions. We don't need them as we're going to pull in their
1d019706d866 LLVM10 anatofuz parents: diff changeset	80 // definitions from .hpp files.
1d019706d866 LLVM10 anatofuz parents: diff changeset	81 #define __DEVICE_FUNCTIONS_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	82 #define __MATH_FUNCTIONS_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	83 #define __COMMON_FUNCTIONS_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	84 // device_functions_decls is replaced by __clang_cuda_device_functions.h
1d019706d866 LLVM10 anatofuz parents: diff changeset	85 // included below.
1d019706d866 LLVM10 anatofuz parents: diff changeset	86 #define __DEVICE_FUNCTIONS_DECLS_H__
1d019706d866 LLVM10 anatofuz parents: diff changeset	87
1d019706d866 LLVM10 anatofuz parents: diff changeset	88 #undef __CUDACC__
1d019706d866 LLVM10 anatofuz parents: diff changeset	89 #if CUDA_VERSION < 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	90 #define __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	91 #else
173 0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	92 #define __CUDACC__
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	93 #define __CUDA_LIBDEVICE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	94 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	95 // Disables definitions of device-side runtime support stubs in
1d019706d866 LLVM10 anatofuz parents: diff changeset	96 // cuda_device_runtime_api.h
173 0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	97 #include "host_defines.h"
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	98 #undef __CUDACC__
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	99 #include "driver_types.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	100 #include "host_config.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	101
1d019706d866 LLVM10 anatofuz parents: diff changeset	102 // Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
1d019706d866 LLVM10 anatofuz parents: diff changeset	103 // cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
1d019706d866 LLVM10 anatofuz parents: diff changeset	104 // functional equivalent of what we need.
1d019706d866 LLVM10 anatofuz parents: diff changeset	105 #pragma push_macro("nv_weak")
1d019706d866 LLVM10 anatofuz parents: diff changeset	106 #define nv_weak weak
1d019706d866 LLVM10 anatofuz parents: diff changeset	107 #undef __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	108 #undef __CUDA_LIBDEVICE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	109 #define __CUDACC__
1d019706d866 LLVM10 anatofuz parents: diff changeset	110 #include "cuda_runtime.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	111
1d019706d866 LLVM10 anatofuz parents: diff changeset	112 #pragma pop_macro("nv_weak")
1d019706d866 LLVM10 anatofuz parents: diff changeset	113 #undef __CUDACC__
1d019706d866 LLVM10 anatofuz parents: diff changeset	114 #define __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	115
1d019706d866 LLVM10 anatofuz parents: diff changeset	116 // CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
1d019706d866 LLVM10 anatofuz parents: diff changeset	117 // not have at the moment. Emulate them with a builtin memcpy/memset.
1d019706d866 LLVM10 anatofuz parents: diff changeset	118 #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
1d019706d866 LLVM10 anatofuz parents: diff changeset	119 #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
1d019706d866 LLVM10 anatofuz parents: diff changeset	120
1d019706d866 LLVM10 anatofuz parents: diff changeset	121 #if CUDA_VERSION < 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	122 #include "crt/device_runtime.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	123 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	124 #include "crt/host_runtime.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	125 // device_runtime.h defines __cxa_* macros that will conflict with
1d019706d866 LLVM10 anatofuz parents: diff changeset	126 // cxxabi.h.
1d019706d866 LLVM10 anatofuz parents: diff changeset	127 // FIXME: redefine these as __device__ functions.
1d019706d866 LLVM10 anatofuz parents: diff changeset	128 #undef __cxa_vec_ctor
1d019706d866 LLVM10 anatofuz parents: diff changeset	129 #undef __cxa_vec_cctor
1d019706d866 LLVM10 anatofuz parents: diff changeset	130 #undef __cxa_vec_dtor
1d019706d866 LLVM10 anatofuz parents: diff changeset	131 #undef __cxa_vec_new
1d019706d866 LLVM10 anatofuz parents: diff changeset	132 #undef __cxa_vec_new2
1d019706d866 LLVM10 anatofuz parents: diff changeset	133 #undef __cxa_vec_new3
1d019706d866 LLVM10 anatofuz parents: diff changeset	134 #undef __cxa_vec_delete2
1d019706d866 LLVM10 anatofuz parents: diff changeset	135 #undef __cxa_vec_delete
1d019706d866 LLVM10 anatofuz parents: diff changeset	136 #undef __cxa_vec_delete3
1d019706d866 LLVM10 anatofuz parents: diff changeset	137 #undef __cxa_pure_virtual
1d019706d866 LLVM10 anatofuz parents: diff changeset	138
1d019706d866 LLVM10 anatofuz parents: diff changeset	139 // math_functions.hpp expects this host function to be defined on MacOS, but it
1d019706d866 LLVM10 anatofuz parents: diff changeset	140 // ends up not being there because of the games we play here. Just define it
1d019706d866 LLVM10 anatofuz parents: diff changeset	141 // ourselves; it's simple enough.
1d019706d866 LLVM10 anatofuz parents: diff changeset	142 #ifdef __APPLE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	143 inline __host__ double __signbitd(double x) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	144 return std::signbit(x); // bool result is implicitly converted to the double return type
1d019706d866 LLVM10 anatofuz parents: diff changeset	145 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	146 #endif // __APPLE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	147
1d019706d866 LLVM10 anatofuz parents: diff changeset	148 // CUDA 9.1 no longer provides declarations for libdevice functions, so we need
1d019706d866 LLVM10 anatofuz parents: diff changeset	149 // to provide our own.
1d019706d866 LLVM10 anatofuz parents: diff changeset	150 #include <__clang_cuda_libdevice_declares.h>
1d019706d866 LLVM10 anatofuz parents: diff changeset	151
173 0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	152 // Wrappers for many device-side standard library functions, incl. math
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	153 // functions, became compiler builtins in CUDA-9 and have been removed from the
0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	154 // CUDA headers. Clang now provides its own implementation of the wrappers.
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	155 #if CUDA_VERSION >= 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	156 #include <__clang_cuda_device_functions.h>
173 0572611fdcc8 reorgnization done Shinji KONO <kono@ie.u-ryukyu.ac.jp> parents: 150 diff changeset	157 #include <__clang_cuda_math.h>
150 1d019706d866 LLVM10 anatofuz parents: diff changeset	158 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	159
1d019706d866 LLVM10 anatofuz parents: diff changeset	160 // __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
1d019706d866 LLVM10 anatofuz parents: diff changeset	161 // counterpart does not do it, so we need to make it empty here to keep
1d019706d866 LLVM10 anatofuz parents: diff changeset	162 // following CUDA includes happy.
1d019706d866 LLVM10 anatofuz parents: diff changeset	163 #undef __THROW
1d019706d866 LLVM10 anatofuz parents: diff changeset	164 #define __THROW
1d019706d866 LLVM10 anatofuz parents: diff changeset	165
1d019706d866 LLVM10 anatofuz parents: diff changeset	166 // CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
1d019706d866 LLVM10 anatofuz parents: diff changeset	167 // Previous versions used to check whether they are defined or not.
1d019706d866 LLVM10 anatofuz parents: diff changeset	168 // CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
1d019706d866 LLVM10 anatofuz parents: diff changeset	169 // here to detect the switch.
1d019706d866 LLVM10 anatofuz parents: diff changeset	170
1d019706d866 LLVM10 anatofuz parents: diff changeset	171 #if defined(CU_DEVICE_INVALID)
1d019706d866 LLVM10 anatofuz parents: diff changeset	172 #if !defined(__USE_FAST_MATH__)
1d019706d866 LLVM10 anatofuz parents: diff changeset	173 #define __USE_FAST_MATH__ 0
1d019706d866 LLVM10 anatofuz parents: diff changeset	174 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	175
1d019706d866 LLVM10 anatofuz parents: diff changeset	176 #if !defined(__CUDA_PREC_DIV)
1d019706d866 LLVM10 anatofuz parents: diff changeset	177 #define __CUDA_PREC_DIV 0
1d019706d866 LLVM10 anatofuz parents: diff changeset	178 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	179 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	180
1d019706d866 LLVM10 anatofuz parents: diff changeset	181 // Temporarily poison __host__ macro to ensure it's not used by any of
1d019706d866 LLVM10 anatofuz parents: diff changeset	182 // the headers we're about to include.
1d019706d866 LLVM10 anatofuz parents: diff changeset	183 #pragma push_macro("__host__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	184 #define __host__ UNEXPECTED_HOST_ATTRIBUTE
1d019706d866 LLVM10 anatofuz parents: diff changeset	185
1d019706d866 LLVM10 anatofuz parents: diff changeset	186 // device_functions.hpp and math_functions*.hpp use 'static
1d019706d866 LLVM10 anatofuz parents: diff changeset	187 // __forceinline__' (with no __device__) for definitions of device
1d019706d866 LLVM10 anatofuz parents: diff changeset	188 // functions. Temporarily redefine __forceinline__ to include
1d019706d866 LLVM10 anatofuz parents: diff changeset	189 // __device__.
1d019706d866 LLVM10 anatofuz parents: diff changeset	190 #pragma push_macro("__forceinline__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	191 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
1d019706d866 LLVM10 anatofuz parents: diff changeset	192 #if CUDA_VERSION < 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	193 #include "device_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	194 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	195
1d019706d866 LLVM10 anatofuz parents: diff changeset	196 // math_functions.hpp uses the __USE_FAST_MATH__ macro to determine whether we
1d019706d866 LLVM10 anatofuz parents: diff changeset	197 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
1d019706d866 LLVM10 anatofuz parents: diff changeset	198 // sin and exp. This is controlled in clang by -fcuda-approx-transcendentals.
1d019706d866 LLVM10 anatofuz parents: diff changeset	199 //
1d019706d866 LLVM10 anatofuz parents: diff changeset	200 // device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
1d019706d866 LLVM10 anatofuz parents: diff changeset	201 // slow divides), so we need to scope our define carefully here.
1d019706d866 LLVM10 anatofuz parents: diff changeset	202 #pragma push_macro("__USE_FAST_MATH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	203 #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
1d019706d866 LLVM10 anatofuz parents: diff changeset	204 #define __USE_FAST_MATH__ 1
1d019706d866 LLVM10 anatofuz parents: diff changeset	205 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	206
1d019706d866 LLVM10 anatofuz parents: diff changeset	207 #if CUDA_VERSION >= 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	208 // CUDA-9.2 needs host-side memcpy for some host functions in
1d019706d866 LLVM10 anatofuz parents: diff changeset	209 // device_functions.hpp
1d019706d866 LLVM10 anatofuz parents: diff changeset	210 #if CUDA_VERSION >= 9020
1d019706d866 LLVM10 anatofuz parents: diff changeset	211 #include <string.h>
1d019706d866 LLVM10 anatofuz parents: diff changeset	212 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	213 #include "crt/math_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	214 #else
1d019706d866 LLVM10 anatofuz parents: diff changeset	215 #include "math_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	216 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	217
1d019706d866 LLVM10 anatofuz parents: diff changeset	218 #pragma pop_macro("__USE_FAST_MATH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	219
1d019706d866 LLVM10 anatofuz parents: diff changeset	220 #if CUDA_VERSION < 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	221 #include "math_functions_dbl_ptx3.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	222 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	223 #pragma pop_macro("__forceinline__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	224
1d019706d866 LLVM10 anatofuz parents: diff changeset	225 // Pull in host-only functions that are only available when neither
1d019706d866 LLVM10 anatofuz parents: diff changeset	226 // __CUDACC__ nor __CUDABE__ are defined.
1d019706d866 LLVM10 anatofuz parents: diff changeset	227 #undef __MATH_FUNCTIONS_HPP__
1d019706d866 LLVM10 anatofuz parents: diff changeset	228 #undef __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	229 #if CUDA_VERSION < 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	230 #include "math_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	231 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	232 // Alas, additional overloads for these functions are hard to get to.
1d019706d866 LLVM10 anatofuz parents: diff changeset	233 // Considering that we only need these overloads for a few functions,
1d019706d866 LLVM10 anatofuz parents: diff changeset	234 // we can provide them here.
1d019706d866 LLVM10 anatofuz parents: diff changeset	235 static inline float rsqrt(float __a) { return rsqrtf(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	236 static inline float rcbrt(float __a) { return rcbrtf(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	237 static inline float sinpi(float __a) { return sinpif(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	238 static inline float cospi(float __a) { return cospif(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	239 static inline void sincospi(float __a, float *__b, float *__c) { // __b and __c are output pointers
1d019706d866 LLVM10 anatofuz parents: diff changeset	240 return sincospif(__a, __b, __c); // both results are written through __b and __c by sincospif
1d019706d866 LLVM10 anatofuz parents: diff changeset	241 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	242 static inline float erfcinv(float __a) { return erfcinvf(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	243 static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	244 static inline float normcdf(float __a) { return normcdff(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	245 static inline float erfcx(float __a) { return erfcxf(__a); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	246
1d019706d866 LLVM10 anatofuz parents: diff changeset	247 #if CUDA_VERSION < 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	248 // For some reason single-argument variant is not always declared by
1d019706d866 LLVM10 anatofuz parents: diff changeset	249 // CUDA headers. Alas, device_functions.hpp included below needs it.
1d019706d866 LLVM10 anatofuz parents: diff changeset	250 static inline __device__ void __brkpt(int __c) { __brkpt(); } // __c is unused; forwards to the zero-argument __brkpt()
1d019706d866 LLVM10 anatofuz parents: diff changeset	251 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	252
1d019706d866 LLVM10 anatofuz parents: diff changeset	253 // Now include *.hpp with definitions of various GPU functions. Alas,
1d019706d866 LLVM10 anatofuz parents: diff changeset	254 // a lot of things get declared/defined with __host__ attribute which
1d019706d866 LLVM10 anatofuz parents: diff changeset	255 // we don't want and we have to define it out. We also have to include
1d019706d866 LLVM10 anatofuz parents: diff changeset	256 // {device,math}_functions.hpp again in order to extract the other
1d019706d866 LLVM10 anatofuz parents: diff changeset	257 // branch of #if/else inside.
1d019706d866 LLVM10 anatofuz parents: diff changeset	258 #define __host__
1d019706d866 LLVM10 anatofuz parents: diff changeset	259 #undef __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	260 #define __CUDACC__
1d019706d866 LLVM10 anatofuz parents: diff changeset	261 #if CUDA_VERSION >= 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	262 // Some atomic functions became compiler builtins in CUDA-9, so we need their
1d019706d866 LLVM10 anatofuz parents: diff changeset	263 // declarations.
1d019706d866 LLVM10 anatofuz parents: diff changeset	264 #include "device_atomic_functions.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	265 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	266 #undef __DEVICE_FUNCTIONS_HPP__
1d019706d866 LLVM10 anatofuz parents: diff changeset	267 #include "device_atomic_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	268 #if CUDA_VERSION >= 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	269 #include "crt/device_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	270 #include "crt/device_double_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	271 #else
1d019706d866 LLVM10 anatofuz parents: diff changeset	272 #include "device_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	273 #define __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	274 #include "device_double_functions.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	275 #undef __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	276 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	277 #include "sm_20_atomic_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	278 #include "sm_20_intrinsics.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	279 #include "sm_32_atomic_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	280
1d019706d866 LLVM10 anatofuz parents: diff changeset	281 // Don't include sm_30_intrinsics.h and sm_32_intrinsics.h. These define the
1d019706d866 LLVM10 anatofuz parents: diff changeset	282 // __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
1d019706d866 LLVM10 anatofuz parents: diff changeset	283 // define them using builtins so that the optimizer can reason about and across
1d019706d866 LLVM10 anatofuz parents: diff changeset	284 // these instructions. In particular, using intrinsics for ldg gets us the
1d019706d866 LLVM10 anatofuz parents: diff changeset	285 // [addr+imm] addressing mode, which, although it doesn't actually exist in the
1d019706d866 LLVM10 anatofuz parents: diff changeset	286 // hardware, seems to generate faster machine code because ptxas can more easily
1d019706d866 LLVM10 anatofuz parents: diff changeset	287 // reason about our code.
1d019706d866 LLVM10 anatofuz parents: diff changeset	288
1d019706d866 LLVM10 anatofuz parents: diff changeset	289 #if CUDA_VERSION >= 8000
1d019706d866 LLVM10 anatofuz parents: diff changeset	290 #pragma push_macro("__CUDA_ARCH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	291 #undef __CUDA_ARCH__
1d019706d866 LLVM10 anatofuz parents: diff changeset	292 #include "sm_60_atomic_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	293 #include "sm_61_intrinsics.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	294 #pragma pop_macro("__CUDA_ARCH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	295 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	296
1d019706d866 LLVM10 anatofuz parents: diff changeset	297 #undef __MATH_FUNCTIONS_HPP__
1d019706d866 LLVM10 anatofuz parents: diff changeset	298
1d019706d866 LLVM10 anatofuz parents: diff changeset	299 // math_functions.hpp defines ::signbit as a __host__ __device__ function. This
1d019706d866 LLVM10 anatofuz parents: diff changeset	300 // conflicts with libstdc++'s constexpr ::signbit, so we have to rename
1d019706d866 LLVM10 anatofuz parents: diff changeset	301 // math_functions.hpp's ::signbit. It's guarded by #undef signbit, but that's
1d019706d866 LLVM10 anatofuz parents: diff changeset	302 // conditional on __GNUC__. :)
1d019706d866 LLVM10 anatofuz parents: diff changeset	303 #pragma push_macro("signbit")
1d019706d866 LLVM10 anatofuz parents: diff changeset	304 #pragma push_macro("__GNUC__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	305 #undef __GNUC__
1d019706d866 LLVM10 anatofuz parents: diff changeset	306 #define signbit __ignored_cuda_signbit
1d019706d866 LLVM10 anatofuz parents: diff changeset	307
1d019706d866 LLVM10 anatofuz parents: diff changeset	308 // CUDA-9 omits device-side definitions of some math functions if it sees
1d019706d866 LLVM10 anatofuz parents: diff changeset	309 // include guard from math.h wrapper from libstdc++. We have to undo the header
1d019706d866 LLVM10 anatofuz parents: diff changeset	310 // guard temporarily to get the definitions we need.
1d019706d866 LLVM10 anatofuz parents: diff changeset	311 #pragma push_macro("_GLIBCXX_MATH_H")
1d019706d866 LLVM10 anatofuz parents: diff changeset	312 #pragma push_macro("_LIBCPP_VERSION")
1d019706d866 LLVM10 anatofuz parents: diff changeset	313 #if CUDA_VERSION >= 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	314 #undef _GLIBCXX_MATH_H
1d019706d866 LLVM10 anatofuz parents: diff changeset	315 // We also need to undo another guard that checks for libc++ 3.8+
1d019706d866 LLVM10 anatofuz parents: diff changeset	316 #ifdef _LIBCPP_VERSION
1d019706d866 LLVM10 anatofuz parents: diff changeset	317 #define _LIBCPP_VERSION 3700
1d019706d866 LLVM10 anatofuz parents: diff changeset	318 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	319 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	320
1d019706d866 LLVM10 anatofuz parents: diff changeset	321 #if CUDA_VERSION >= 9000
1d019706d866 LLVM10 anatofuz parents: diff changeset	322 #include "crt/math_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	323 #else
1d019706d866 LLVM10 anatofuz parents: diff changeset	324 #include "math_functions.hpp"
1d019706d866 LLVM10 anatofuz parents: diff changeset	325 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	326 #pragma pop_macro("_GLIBCXX_MATH_H")
1d019706d866 LLVM10 anatofuz parents: diff changeset	327 #pragma pop_macro("_LIBCPP_VERSION")
1d019706d866 LLVM10 anatofuz parents: diff changeset	328 #pragma pop_macro("__GNUC__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	329 #pragma pop_macro("signbit")
1d019706d866 LLVM10 anatofuz parents: diff changeset	330
1d019706d866 LLVM10 anatofuz parents: diff changeset	331 #pragma pop_macro("__host__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	332
1d019706d866 LLVM10 anatofuz parents: diff changeset	333 #include "texture_indirect_functions.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	334
1d019706d866 LLVM10 anatofuz parents: diff changeset	335 // Restore state of __CUDA_ARCH__ and __THROW we had on entry.
1d019706d866 LLVM10 anatofuz parents: diff changeset	336 #pragma pop_macro("__CUDA_ARCH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	337 #pragma pop_macro("__THROW")
1d019706d866 LLVM10 anatofuz parents: diff changeset	338
1d019706d866 LLVM10 anatofuz parents: diff changeset	339 // Set up compiler macros expected to be seen during compilation.
1d019706d866 LLVM10 anatofuz parents: diff changeset	340 #undef __CUDABE__
1d019706d866 LLVM10 anatofuz parents: diff changeset	341 #define __CUDACC__
1d019706d866 LLVM10 anatofuz parents: diff changeset	342
1d019706d866 LLVM10 anatofuz parents: diff changeset	343 extern "C" {
1d019706d866 LLVM10 anatofuz parents: diff changeset	344 // Device-side CUDA system calls.
1d019706d866 LLVM10 anatofuz parents: diff changeset	345 // http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
1d019706d866 LLVM10 anatofuz parents: diff changeset	346 // We need these declarations and wrappers for device-side
1d019706d866 LLVM10 anatofuz parents: diff changeset	347 // malloc/free/printf calls to work without relying on
1d019706d866 LLVM10 anatofuz parents: diff changeset	348 // -fcuda-disable-target-call-checks option.
1d019706d866 LLVM10 anatofuz parents: diff changeset	349 __device__ int vprintf(const char *, const char *);
1d019706d866 LLVM10 anatofuz parents: diff changeset	350 __device__ void free(void *) __attribute((nothrow));
1d019706d866 LLVM10 anatofuz parents: diff changeset	351 __device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
1d019706d866 LLVM10 anatofuz parents: diff changeset	352 __device__ void __assertfail(const char *__message, const char *__file,
1d019706d866 LLVM10 anatofuz parents: diff changeset	353 unsigned __line, const char *__function,
1d019706d866 LLVM10 anatofuz parents: diff changeset	354 size_t __charSize) __attribute__((noreturn));
1d019706d866 LLVM10 anatofuz parents: diff changeset	355
1d019706d866 LLVM10 anatofuz parents: diff changeset	356 // In order for standard assert() macro on linux to work we need to
1d019706d866 LLVM10 anatofuz parents: diff changeset	357 // provide device-side __assert_fail()
1d019706d866 LLVM10 anatofuz parents: diff changeset	358 __device__ static inline void __assert_fail(const char *__message,
1d019706d866 LLVM10 anatofuz parents: diff changeset	359 const char *__file, unsigned __line,
1d019706d866 LLVM10 anatofuz parents: diff changeset	360 const char *__function) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	361 __assertfail(__message, __file, __line, __function, sizeof(char));
1d019706d866 LLVM10 anatofuz parents: diff changeset	362 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	363
1d019706d866 LLVM10 anatofuz parents: diff changeset	364 // Clang will convert printf into vprintf, but we still need
1d019706d866 LLVM10 anatofuz parents: diff changeset	365 // device-side declaration for it.
1d019706d866 LLVM10 anatofuz parents: diff changeset	366 __device__ int printf(const char *, ...);
1d019706d866 LLVM10 anatofuz parents: diff changeset	367 } // extern "C"
1d019706d866 LLVM10 anatofuz parents: diff changeset	368
1d019706d866 LLVM10 anatofuz parents: diff changeset	369 // We also need device-side std::malloc and std::free.
1d019706d866 LLVM10 anatofuz parents: diff changeset	370 namespace std {
1d019706d866 LLVM10 anatofuz parents: diff changeset	371 __device__ static inline void free(void *__ptr) { ::free(__ptr); }
1d019706d866 LLVM10 anatofuz parents: diff changeset	372 __device__ static inline void *malloc(size_t __size) {
1d019706d866 LLVM10 anatofuz parents: diff changeset	373 return ::malloc(__size);
1d019706d866 LLVM10 anatofuz parents: diff changeset	374 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	375 } // namespace std
1d019706d866 LLVM10 anatofuz parents: diff changeset	376
1d019706d866 LLVM10 anatofuz parents: diff changeset	377 // Out-of-line implementations from __clang_cuda_builtin_vars.h. These need to
1d019706d866 LLVM10 anatofuz parents: diff changeset	378 // come after we've pulled in the definition of uint3 and dim3.
1d019706d866 LLVM10 anatofuz parents: diff changeset	379
1d019706d866 LLVM10 anatofuz parents: diff changeset	380 __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	381 uint3 ret;
1d019706d866 LLVM10 anatofuz parents: diff changeset	382 ret.x = x;
1d019706d866 LLVM10 anatofuz parents: diff changeset	383 ret.y = y;
1d019706d866 LLVM10 anatofuz parents: diff changeset	384 ret.z = z;
1d019706d866 LLVM10 anatofuz parents: diff changeset	385 return ret;
1d019706d866 LLVM10 anatofuz parents: diff changeset	386 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	387
1d019706d866 LLVM10 anatofuz parents: diff changeset	388 __device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	389 uint3 ret;
1d019706d866 LLVM10 anatofuz parents: diff changeset	390 ret.x = x;
1d019706d866 LLVM10 anatofuz parents: diff changeset	391 ret.y = y;
1d019706d866 LLVM10 anatofuz parents: diff changeset	392 ret.z = z;
1d019706d866 LLVM10 anatofuz parents: diff changeset	393 return ret;
1d019706d866 LLVM10 anatofuz parents: diff changeset	394 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	395
1d019706d866 LLVM10 anatofuz parents: diff changeset	396 __device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	397 return dim3(x, y, z);
1d019706d866 LLVM10 anatofuz parents: diff changeset	398 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	399
1d019706d866 LLVM10 anatofuz parents: diff changeset	400 __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
1d019706d866 LLVM10 anatofuz parents: diff changeset	401 return dim3(x, y, z);
1d019706d866 LLVM10 anatofuz parents: diff changeset	402 }
1d019706d866 LLVM10 anatofuz parents: diff changeset	403
1d019706d866 LLVM10 anatofuz parents: diff changeset	404 #include <__clang_cuda_cmath.h>
1d019706d866 LLVM10 anatofuz parents: diff changeset	405 #include <__clang_cuda_intrinsics.h>
1d019706d866 LLVM10 anatofuz parents: diff changeset	406 #include <__clang_cuda_complex_builtins.h>
1d019706d866 LLVM10 anatofuz parents: diff changeset	407
1d019706d866 LLVM10 anatofuz parents: diff changeset	408 // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
1d019706d866 LLVM10 anatofuz parents: diff changeset	409 // mode, giving them their "proper" types of dim3 and uint3. This is
1d019706d866 LLVM10 anatofuz parents: diff changeset	410 // incompatible with the types we give in __clang_cuda_builtin_vars.h. As a
1d019706d866 LLVM10 anatofuz parents: diff changeset	411 // hack, force-include the header (nvcc doesn't include it by default) but
1d019706d866 LLVM10 anatofuz parents: diff changeset	412 // redefine dim3 and uint3 to our builtin types. (Thankfully dim3 and uint3 are
1d019706d866 LLVM10 anatofuz parents: diff changeset	413 // only used here for the redeclarations of blockDim and threadIdx.)
1d019706d866 LLVM10 anatofuz parents: diff changeset	414 #pragma push_macro("dim3")
1d019706d866 LLVM10 anatofuz parents: diff changeset	415 #pragma push_macro("uint3")
1d019706d866 LLVM10 anatofuz parents: diff changeset	416 #define dim3 __cuda_builtin_blockDim_t
1d019706d866 LLVM10 anatofuz parents: diff changeset	417 #define uint3 __cuda_builtin_threadIdx_t
1d019706d866 LLVM10 anatofuz parents: diff changeset	418 #include "curand_mtgp32_kernel.h"
1d019706d866 LLVM10 anatofuz parents: diff changeset	419 #pragma pop_macro("dim3")
1d019706d866 LLVM10 anatofuz parents: diff changeset	420 #pragma pop_macro("uint3")
1d019706d866 LLVM10 anatofuz parents: diff changeset	421 #pragma pop_macro("__USE_FAST_MATH__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	422 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
1d019706d866 LLVM10 anatofuz parents: diff changeset	423
1d019706d866 LLVM10 anatofuz parents: diff changeset	424 // CUDA runtime uses this undocumented function to access kernel launch
1d019706d866 LLVM10 anatofuz parents: diff changeset	425 // configuration. The declaration is in crt/device_functions.h but that file
1d019706d866 LLVM10 anatofuz parents: diff changeset	426 // includes a lot of other stuff we don't want. Instead, we'll provide our own
1d019706d866 LLVM10 anatofuz parents: diff changeset	427 // declaration for it here.
1d019706d866 LLVM10 anatofuz parents: diff changeset	428 #if CUDA_VERSION >= 9020
1d019706d866 LLVM10 anatofuz parents: diff changeset	429 extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
1d019706d866 LLVM10 anatofuz parents: diff changeset	430 size_t sharedMem = 0,
1d019706d866 LLVM10 anatofuz parents: diff changeset	431 void *stream = 0);
1d019706d866 LLVM10 anatofuz parents: diff changeset	432 #endif
1d019706d866 LLVM10 anatofuz parents: diff changeset	433
1d019706d866 LLVM10 anatofuz parents: diff changeset	434 #endif // __CUDA__
1d019706d866 LLVM10 anatofuz parents: diff changeset	435 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__

Mercurial > hg > CbC > CbC_llvm

annotate clang/lib/Headers/__clang_cuda_runtime_wrapper.h @ 176:de4ac79aef9d