//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a device stub function to a symbol for identifying the kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of the variable containing the handle of the GPU binary.
  /// Populated by ModuleCtorFunction() and used to create corresponding
  /// cleanup calls in ModuleDtorFunction().
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result can be used wherever the C code
  /// expects a const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure this section isn't
      // merged away and really ends up in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

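// For example, addPrefixToName("LaunchKernel") yields "cudaLaunchKernel" for
// CUDA and "hipLaunchKernel" for HIP; addUnderscoredPrefixToName("RegisterVar")
// yields "__cudaRegisterVar" / "__hipRegisterVar".
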
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, create a device mangle
  // context so that mangling retrieves the device-side lambda mangling number
  // instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  } else {
    // cudaError_t cudaLaunch(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
  }
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto CallbackFnTy = getCallbackFnTy();
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // ND could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make a unique name for the device-side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode &&
      !CGM.getLangOpts().CUID.empty()) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedStaticVar(Out);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels: parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
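//
// A rough sketch of the stub body emitted below, with illustrative names (the
// actual entry points get the cuda/hip prefix via addPrefixToName):
//   void *kernel_args[max(1, N)] = {&arg0, ..., &argN-1};
//   dim3 grid_dim, block_dim; size_t shmem_size; void *stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((void *)kernel_handle, grid_dim, block_dim, kernel_args,
//                    shmem_size, stream);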
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Calculate the amount of space we will need for all arguments. If we have
  // no args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to the runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Look up the cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

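// Legacy (pre-9.0) launch: each kernel argument is copied into the pending
// launch configuration with cudaSetupArgument at its device-side offset, and
// cudaLaunch/hipLaunchByPtr then fires the kernel. Schematically:
//   if (cudaSetupArgument(&arg0, sizeof(arg0), offset0) == 0)
//     if (cudaSetupArgument(&arg1, sizeof(arg1), offset1) == 0)
//       cudaLaunch((char *)kernel_handle);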
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
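    // e.g. for (char, double), the char lands at offset 0 and the double is
    // bumped up to offset 8, matching the device-side parameter layout.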
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from the variable
// ManagedVar, which is populated by the HIP runtime.
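// Schematically (illustrative names), a use of Var such as
//   store i32 1, i32* @var
// becomes a load of the managed address followed by the original access:
//   %ld.managed = load i32*, i32** @var.managed
//   store i32 1, i32* %ld.managed
// Uses inside constant expressions are first rewritten as instructions.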
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = llvm::createReplacementInstr(CE, I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle, passed as the first argument to
  // __cuda_register_globals(), and generate a __cudaRegisterFunction() call
  // for each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, the device variable size parameter is of type
  // `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register the GPU binary with the CUDA runtime, store the returned handle
  // in a global variable, and save a reference in GpuBinaryHandle to be
  // cleaned up in the destructor on exit. Then associate all known kernels
  // with the GPU binary handle so the CUDA runtime can figure out what to
  // call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If the fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If the fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through a linker script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create the initialized wrapper structure that points to the loaded GPU
  // binary.
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable are part
    // of the HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries from being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register the binary with the CUDA runtime. This is substantially
    // different in default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create the destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2,
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by the constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for a destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of
  // device-side global variables become internal definitions. These have to
  // be internal in order to prevent name conflicts with global host variables
  // with the same name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with the CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with the CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory, which will be allocated by the runtime.
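// For a managed variable
//   __managed__ int x;
// the device-side module ends up with, schematically,
//   @x         = externally_initialized global i32* null  ; pointer slot
//   @x.managed = global i32 ...                           ; original data
// and each use of x loads the runtime-provided pointer from @x first.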
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code,
      // since they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Returns the module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();

    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute, since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  return makeModuleCtorFunction();
}

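// Returns the symbol the host uses to identify the kernel. For CUDA this is
// the stub function itself; for HIP a separate constant global is created
// whose name is the kernel's own mangled name. Illustratively, for
// __global__ void foo(), the stub is _Z18__device_stub__foov and the handle
// global is _Z3foov, which launch and registration code reference instead.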
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F);
  if (Loc != KernelHandles.end())
    return Loc->second;

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  KernelHandles[F] = Var;
  KernelStubs[Var] = F;
  return Var;
}
|