comparison clang/lib/CodeGen/CGCUDANV.cpp @ 221:79ff65ed7e25

LLVM12 Original
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Tue, 15 Jun 2021 19:15:29 +0900
parents 0572611fdcc8
children 5f17cb93ff66
comparison
equal deleted inserted replaced
220:42394fc6a535 221:79ff65ed7e25
10 // runtime library. 10 // runtime library.
11 // 11 //
12 //===----------------------------------------------------------------------===// 12 //===----------------------------------------------------------------------===//
13 13
14 #include "CGCUDARuntime.h" 14 #include "CGCUDARuntime.h"
15 #include "CGCXXABI.h"
15 #include "CodeGenFunction.h" 16 #include "CodeGenFunction.h"
16 #include "CodeGenModule.h" 17 #include "CodeGenModule.h"
17 #include "clang/AST/Decl.h" 18 #include "clang/AST/Decl.h"
18 #include "clang/Basic/Cuda.h" 19 #include "clang/Basic/Cuda.h"
19 #include "clang/CodeGen/CodeGenABITypes.h" 20 #include "clang/CodeGen/CodeGenABITypes.h"
20 #include "clang/CodeGen/ConstantInitBuilder.h" 21 #include "clang/CodeGen/ConstantInitBuilder.h"
21 #include "llvm/IR/BasicBlock.h" 22 #include "llvm/IR/BasicBlock.h"
22 #include "llvm/IR/Constants.h" 23 #include "llvm/IR/Constants.h"
23 #include "llvm/IR/DerivedTypes.h" 24 #include "llvm/IR/DerivedTypes.h"
25 #include "llvm/IR/ReplaceConstant.h"
24 #include "llvm/Support/Format.h" 26 #include "llvm/Support/Format.h"
25 27
26 using namespace clang; 28 using namespace clang;
27 using namespace CodeGen; 29 using namespace CodeGen;
28 30
39 41
40 /// Convenience reference to LLVM Context 42 /// Convenience reference to LLVM Context
41 llvm::LLVMContext &Context; 43 llvm::LLVMContext &Context;
42 /// Convenience reference to the current module 44 /// Convenience reference to the current module
43 llvm::Module &TheModule; 45 llvm::Module &TheModule;
44 /// Keeps track of kernel launch stubs emitted in this module 46 /// Keeps track of kernel launch stubs and handles emitted in this module
45 struct KernelInfo { 47 struct KernelInfo {
46 llvm::Function *Kernel; 48 llvm::Function *Kernel; // stub function to help launch kernel
47 const Decl *D; 49 const Decl *D;
48 }; 50 };
49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; 51 llvm::SmallVector<KernelInfo, 16> EmittedKernels;
52 // Map a device stub function to a symbol for identifying kernel in host code.
53 // For CUDA, the symbol for identifying the kernel is the same as the device
54 // stub function. For HIP, they are different.
55 llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
56 // Map a kernel handle to the kernel stub.
57 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
50 struct VarInfo { 58 struct VarInfo {
51 llvm::GlobalVariable *Var; 59 llvm::GlobalVariable *Var;
52 const VarDecl *D; 60 const VarDecl *D;
53 DeviceVarFlags Flags; 61 DeviceVarFlags Flags;
54 }; 62 };
117 125
118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); 126 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); 127 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
120 std::string getDeviceSideName(const NamedDecl *ND) override; 128 std::string getDeviceSideName(const NamedDecl *ND) override;
121 129
122 public:
123 CGNVCUDARuntime(CodeGenModule &CGM);
124
125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, 130 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
127 bool Extern, bool Constant) override { 131 bool Extern, bool Constant) {
128 DeviceVars.push_back({&Var, 132 DeviceVars.push_back({&Var,
129 VD, 133 VD,
130 {DeviceVarFlags::Variable, Extern, Constant, 134 {DeviceVarFlags::Variable, Extern, Constant,
131 /*Normalized*/ false, /*Type*/ 0}}); 135 VD->hasAttr<HIPManagedAttr>(),
136 /*Normalized*/ false, 0}});
132 } 137 }
133 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, 138 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
134 bool Extern, int Type) override { 139 bool Extern, int Type) {
135 DeviceVars.push_back({&Var, 140 DeviceVars.push_back({&Var,
136 VD, 141 VD,
137 {DeviceVarFlags::Surface, Extern, /*Constant*/ false, 142 {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
143 /*Managed*/ false,
138 /*Normalized*/ false, Type}}); 144 /*Normalized*/ false, Type}});
139 } 145 }
140 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, 146 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
141 bool Extern, int Type, bool Normalized) override { 147 bool Extern, int Type, bool Normalized) {
142 DeviceVars.push_back({&Var, 148 DeviceVars.push_back({&Var,
143 VD, 149 VD,
144 {DeviceVarFlags::Texture, Extern, /*Constant*/ false, 150 {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
145 Normalized, Type}}); 151 /*Managed*/ false, Normalized, Type}});
146 } 152 }
147 153
148 /// Creates module constructor function 154 /// Creates module constructor function
149 llvm::Function *makeModuleCtorFunction() override; 155 llvm::Function *makeModuleCtorFunction();
150 /// Creates module destructor function 156 /// Creates module destructor function
151 llvm::Function *makeModuleDtorFunction() override; 157 llvm::Function *makeModuleDtorFunction();
158 /// Transform managed variables for device compilation.
159 void transformManagedVars();
160
161 public:
162 CGNVCUDARuntime(CodeGenModule &CGM);
163
164 llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
165 llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
166 auto Loc = KernelStubs.find(Handle);
167 assert(Loc != KernelStubs.end());
168 return Loc->second;
169 }
170 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
171 void handleVarRegistration(const VarDecl *VD,
172 llvm::GlobalVariable &Var) override;
173 void
174 internalizeDeviceSideVar(const VarDecl *D,
175 llvm::GlobalValue::LinkageTypes &Linkage) override;
176
177 llvm::Function *finalizeModule() override;
152 }; 178 };
153 179
154 } 180 }
155 181
156 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { 182 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
163 if (CGM.getLangOpts().HIP) 189 if (CGM.getLangOpts().HIP)
164 return ((Twine("__hip") + Twine(FuncName)).str()); 190 return ((Twine("__hip") + Twine(FuncName)).str());
165 return ((Twine("__cuda") + Twine(FuncName)).str()); 191 return ((Twine("__cuda") + Twine(FuncName)).str());
166 } 192 }
167 193
194 static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
195 // If the host and device have different C++ ABIs, mark it as the device
196 // mangle context so that the mangling needs to retrieve the additional
197 // device lambda mangling number instead of the regular host one.
198 if (CGM.getContext().getAuxTargetInfo() &&
199 CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
200 CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
201 return std::unique_ptr<MangleContext>(
202 CGM.getContext().createDeviceMangleContext(
203 *CGM.getContext().getAuxTargetInfo()));
204 }
205
206 return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
207 CGM.getContext().getAuxTargetInfo()));
208 }
209
168 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) 210 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
169 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), 211 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
170 TheModule(CGM.getModule()), 212 TheModule(CGM.getModule()),
171 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), 213 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
172 DeviceMC(CGM.getContext().createMangleContext( 214 DeviceMC(InitDeviceMC(CGM)) {
173 CGM.getContext().getAuxTargetInfo())) {
174 CodeGen::CodeGenTypes &Types = CGM.getTypes(); 215 CodeGen::CodeGenTypes &Types = CGM.getTypes();
175 ASTContext &Ctx = CGM.getContext(); 216 ASTContext &Ctx = CGM.getContext();
176 217
177 IntTy = CGM.IntTy; 218 IntTy = CGM.IntTy;
178 SizeTy = CGM.SizeTy; 219 SizeTy = CGM.SizeTy;
225 if (auto *FD = dyn_cast<FunctionDecl>(ND)) 266 if (auto *FD = dyn_cast<FunctionDecl>(ND))
226 GD = GlobalDecl(FD, KernelReferenceKind::Kernel); 267 GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
227 else 268 else
228 GD = GlobalDecl(ND); 269 GD = GlobalDecl(ND);
229 std::string DeviceSideName; 270 std::string DeviceSideName;
230 if (DeviceMC->shouldMangleDeclName(ND)) { 271 MangleContext *MC;
272 if (CGM.getLangOpts().CUDAIsDevice)
273 MC = &CGM.getCXXABI().getMangleContext();
274 else
275 MC = DeviceMC.get();
276 if (MC->shouldMangleDeclName(ND)) {
231 SmallString<256> Buffer; 277 SmallString<256> Buffer;
232 llvm::raw_svector_ostream Out(Buffer); 278 llvm::raw_svector_ostream Out(Buffer);
233 DeviceMC->mangleName(GD, Out); 279 MC->mangleName(GD, Out);
234 DeviceSideName = std::string(Out.str()); 280 DeviceSideName = std::string(Out.str());
235 } else 281 } else
236 DeviceSideName = std::string(ND->getIdentifier()->getName()); 282 DeviceSideName = std::string(ND->getIdentifier()->getName());
283
284 // Make unique name for device side static file-scope variable for HIP.
285 if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
286 CGM.getLangOpts().GPURelocatableDeviceCode &&
287 !CGM.getLangOpts().CUID.empty()) {
288 SmallString<256> Buffer;
289 llvm::raw_svector_ostream Out(Buffer);
290 Out << DeviceSideName;
291 CGM.printPostfixForExternalizedStaticVar(Out);
292 DeviceSideName = std::string(Out.str());
293 }
237 return DeviceSideName; 294 return DeviceSideName;
238 } 295 }
239 296
240 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, 297 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
241 FunctionArgList &Args) { 298 FunctionArgList &Args) {
242 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); 299 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
300 if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
301 GV->setLinkage(CGF.CurFn->getLinkage());
302 GV->setInitializer(CGF.CurFn);
303 }
243 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), 304 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
244 CudaFeature::CUDA_USES_NEW_LAUNCH) || 305 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
245 CGF.getLangOpts().HIPUseNewLaunchAPI) 306 (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
246 emitDeviceStubBodyNew(CGF, Args); 307 emitDeviceStubBodyNew(CGF, Args);
247 else 308 else
248 emitDeviceStubBodyLegacy(CGF, Args); 309 emitDeviceStubBodyLegacy(CGF, Args);
249 } 310 }
250 311
281 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); 342 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
282 auto LaunchKernelName = addPrefixToName("LaunchKernel"); 343 auto LaunchKernelName = addPrefixToName("LaunchKernel");
283 IdentifierInfo &cudaLaunchKernelII = 344 IdentifierInfo &cudaLaunchKernelII =
284 CGM.getContext().Idents.get(LaunchKernelName); 345 CGM.getContext().Idents.get(LaunchKernelName);
285 FunctionDecl *cudaLaunchKernelFD = nullptr; 346 FunctionDecl *cudaLaunchKernelFD = nullptr;
286 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { 347 for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
287 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) 348 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
288 cudaLaunchKernelFD = FD; 349 cudaLaunchKernelFD = FD;
289 } 350 }
290 351
291 if (cudaLaunchKernelFD == nullptr) { 352 if (cudaLaunchKernelFD == nullptr) {
316 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, 377 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
317 {GridDim.getPointer(), BlockDim.getPointer(), 378 {GridDim.getPointer(), BlockDim.getPointer(),
318 ShmemSize.getPointer(), Stream.getPointer()}); 379 ShmemSize.getPointer(), Stream.getPointer()});
319 380
320 // Emit the call to cudaLaunch 381 // Emit the call to cudaLaunch
321 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); 382 llvm::Value *Kernel =
383 CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
322 CallArgList LaunchKernelArgs; 384 CallArgList LaunchKernelArgs;
323 LaunchKernelArgs.add(RValue::get(Kernel), 385 LaunchKernelArgs.add(RValue::get(Kernel),
324 cudaLaunchKernelFD->getParamDecl(0)->getType()); 386 cudaLaunchKernelFD->getParamDecl(0)->getType());
325 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); 387 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
326 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); 388 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
352 // Emit a call to cudaSetupArgument for each arg in Args. 414 // Emit a call to cudaSetupArgument for each arg in Args.
353 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); 415 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
354 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); 416 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
355 CharUnits Offset = CharUnits::Zero(); 417 CharUnits Offset = CharUnits::Zero();
356 for (const VarDecl *A : Args) { 418 for (const VarDecl *A : Args) {
357 CharUnits TyWidth, TyAlign; 419 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
358 std::tie(TyWidth, TyAlign) = 420 Offset = Offset.alignTo(TInfo.Align);
359 CGM.getContext().getTypeInfoInChars(A->getType());
360 Offset = Offset.alignTo(TyAlign);
361 llvm::Value *Args[] = { 421 llvm::Value *Args[] = {
362 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), 422 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
363 VoidPtrTy), 423 VoidPtrTy),
364 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), 424 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
365 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), 425 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
366 }; 426 };
367 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); 427 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
368 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); 428 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
369 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); 429 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
370 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); 430 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
371 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); 431 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
372 CGF.EmitBlock(NextBlock); 432 CGF.EmitBlock(NextBlock);
373 Offset += TyWidth; 433 Offset += TInfo.Width;
374 } 434 }
375 435
376 // Emit the call to cudaLaunch 436 // Emit the call to cudaLaunch
377 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); 437 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
378 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); 438 llvm::Value *Arg =
439 CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
379 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); 440 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
380 CGF.EmitBranch(EndBlock); 441 CGF.EmitBranch(EndBlock);
381 442
382 CGF.EmitBlock(EndBlock); 443 CGF.EmitBlock(EndBlock);
444 }
445
446 // Replace the original variable Var with the address loaded from variable
447 // ManagedVar populated by HIP runtime.
448 static void replaceManagedVar(llvm::GlobalVariable *Var,
449 llvm::GlobalVariable *ManagedVar) {
450 SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
451 for (auto &&VarUse : Var->uses()) {
452 WorkList.push_back({VarUse.getUser()});
453 }
454 while (!WorkList.empty()) {
455 auto &&WorkItem = WorkList.pop_back_val();
456 auto *U = WorkItem.back();
457 if (isa<llvm::ConstantExpr>(U)) {
458 for (auto &&UU : U->uses()) {
459 WorkItem.push_back(UU.getUser());
460 WorkList.push_back(WorkItem);
461 WorkItem.pop_back();
462 }
463 continue;
464 }
465 if (auto *I = dyn_cast<llvm::Instruction>(U)) {
466 llvm::Value *OldV = Var;
467 llvm::Instruction *NewV =
468 new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
469 llvm::Align(Var->getAlignment()), I);
470 WorkItem.pop_back();
471 // Replace constant expressions directly or indirectly using the managed
472 // variable with instructions.
473 for (auto &&Op : WorkItem) {
474 auto *CE = cast<llvm::ConstantExpr>(Op);
475 auto *NewInst = llvm::createReplacementInstr(CE, I);
476 NewInst->replaceUsesOfWith(OldV, NewV);
477 OldV = CE;
478 NewV = NewInst;
479 }
480 I->replaceUsesOfWith(OldV, NewV);
481 } else {
482 llvm_unreachable("Invalid use of managed variable");
483 }
484 }
383 } 485 }
384 486
385 /// Creates a function that sets up state on the host side for CUDA objects that 487 /// Creates a function that sets up state on the host side for CUDA objects that
386 /// have a presence on both the host and device sides. Specifically, registers 488 /// have a presence on both the host and device sides. Specifically, registers
387 /// the host side of kernel functions and device global variables with the CUDA 489 /// the host side of kernel functions and device global variables with the CUDA
426 llvm::Constant *KernelName = 528 llvm::Constant *KernelName =
427 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); 529 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
428 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); 530 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
429 llvm::Value *Args[] = { 531 llvm::Value *Args[] = {
430 &GpuBinaryHandlePtr, 532 &GpuBinaryHandlePtr,
431 Builder.CreateBitCast(I.Kernel, VoidPtrTy), 533 Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
432 KernelName, 534 KernelName,
433 KernelName, 535 KernelName,
434 llvm::ConstantInt::get(IntTy, -1), 536 llvm::ConstantInt::get(IntTy, -1),
435 NullPtr, 537 NullPtr,
436 NullPtr, 538 NullPtr,
452 CharPtrTy, IntTy, VarSizeTy, 554 CharPtrTy, IntTy, VarSizeTy,
453 IntTy, IntTy}; 555 IntTy, IntTy};
454 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( 556 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
455 llvm::FunctionType::get(VoidTy, RegisterVarParams, false), 557 llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
456 addUnderscoredPrefixToName("RegisterVar")); 558 addUnderscoredPrefixToName("RegisterVar"));
559 // void __hipRegisterManagedVar(void **, char *, char *, const char *,
560 // size_t, unsigned)
561 llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
562 CharPtrTy, VarSizeTy, IntTy};
563 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
564 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
565 addUnderscoredPrefixToName("RegisterManagedVar"));
457 // void __cudaRegisterSurface(void **, const struct surfaceReference *, 566 // void __cudaRegisterSurface(void **, const struct surfaceReference *,
458 // const void **, const char *, int, int); 567 // const void **, const char *, int, int);
459 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( 568 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
460 llvm::FunctionType::get( 569 llvm::FunctionType::get(
461 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, 570 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
469 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, 578 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
470 false), 579 false),
471 addUnderscoredPrefixToName("RegisterTexture")); 580 addUnderscoredPrefixToName("RegisterTexture"));
472 for (auto &&Info : DeviceVars) { 581 for (auto &&Info : DeviceVars) {
473 llvm::GlobalVariable *Var = Info.Var; 582 llvm::GlobalVariable *Var = Info.Var;
583 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
584 "External variables should not show up here, except HIP managed "
585 "variables");
474 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); 586 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
475 switch (Info.Flags.getKind()) { 587 switch (Info.Flags.getKind()) {
476 case DeviceVarFlags::Variable: { 588 case DeviceVarFlags::Variable: {
477 uint64_t VarSize = 589 uint64_t VarSize =
478 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); 590 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
479 llvm::Value *Args[] = { 591 if (Info.Flags.isManaged()) {
480 &GpuBinaryHandlePtr, 592 auto ManagedVar = new llvm::GlobalVariable(
481 Builder.CreateBitCast(Var, VoidPtrTy), 593 CGM.getModule(), Var->getType(),
482 VarName, 594 /*isConstant=*/false, Var->getLinkage(),
483 VarName, 595 /*Init=*/Var->isDeclaration()
484 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), 596 ? nullptr
485 llvm::ConstantInt::get(VarSizeTy, VarSize), 597 : llvm::ConstantPointerNull::get(Var->getType()),
486 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), 598 /*Name=*/"", /*InsertBefore=*/nullptr,
487 llvm::ConstantInt::get(IntTy, 0)}; 599 llvm::GlobalVariable::NotThreadLocal);
488 Builder.CreateCall(RegisterVar, Args); 600 ManagedVar->setDSOLocal(Var->isDSOLocal());
601 ManagedVar->setVisibility(Var->getVisibility());
602 ManagedVar->setExternallyInitialized(true);
603 ManagedVar->takeName(Var);
604 Var->setName(Twine(ManagedVar->getName() + ".managed"));
605 replaceManagedVar(Var, ManagedVar);
606 llvm::Value *Args[] = {
607 &GpuBinaryHandlePtr,
608 Builder.CreateBitCast(ManagedVar, VoidPtrTy),
609 Builder.CreateBitCast(Var, VoidPtrTy),
610 VarName,
611 llvm::ConstantInt::get(VarSizeTy, VarSize),
612 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
613 if (!Var->isDeclaration())
614 Builder.CreateCall(RegisterManagedVar, Args);
615 } else {
616 llvm::Value *Args[] = {
617 &GpuBinaryHandlePtr,
618 Builder.CreateBitCast(Var, VoidPtrTy),
619 VarName,
620 VarName,
621 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
622 llvm::ConstantInt::get(VarSizeTy, VarSize),
623 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
624 llvm::ConstantInt::get(IntTy, 0)};
625 Builder.CreateCall(RegisterVar, Args);
626 }
489 break; 627 break;
490 } 628 }
491 case DeviceVarFlags::Surface: 629 case DeviceVarFlags::Surface:
492 Builder.CreateCall( 630 Builder.CreateCall(
493 RegisterSurf, 631 RegisterSurf,
595 ModuleIDPrefix = "__hip_"; 733 ModuleIDPrefix = "__hip_";
596 734
597 if (CudaGpuBinary) { 735 if (CudaGpuBinary) {
598 // If fatbin is available from early finalization, create a string 736 // If fatbin is available from early finalization, create a string
599 // literal containing the fat binary loaded from the given file. 737 // literal containing the fat binary loaded from the given file.
600 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), 738 const unsigned HIPCodeObjectAlign = 4096;
601 "", FatbinConstantName, 8); 739 FatBinStr =
740 makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
741 FatbinConstantName, HIPCodeObjectAlign);
602 } else { 742 } else {
603 // If fatbin is not available, create an external symbol 743 // If fatbin is not available, create an external symbol
604 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed 744 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
605 // to contain the fat binary but will be populated somewhere else, 745 // to contain the fat binary but will be populated somewhere else,
606 // e.g. by lld through link script. 746 // e.g. by lld through link script.
844 } 984 }
845 985
846 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { 986 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
847 return new CGNVCUDARuntime(CGM); 987 return new CGNVCUDARuntime(CGM);
848 } 988 }
989
990 void CGNVCUDARuntime::internalizeDeviceSideVar(
991 const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
992 // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
993 // global variables become internal definitions. These have to be internal in
994 // order to prevent name conflicts with global host variables with the same
995 // name in a different TUs.
996 //
997 // For -fgpu-rdc, the shadow variables should not be internalized because
998 // they may be accessed by different TU.
999 if (CGM.getLangOpts().GPURelocatableDeviceCode)
1000 return;
1001
1002 // __shared__ variables are odd. Shadows do get created, but
1003 // they are not registered with the CUDA runtime, so they
1004 // can't really be used to access their device-side
1005 // counterparts. It's not clear yet whether it's nvcc's bug or
1006 // a feature, but we've got to do the same for compatibility.
1007 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
1008 D->hasAttr<CUDASharedAttr>() ||
1009 D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1010 D->getType()->isCUDADeviceBuiltinTextureType()) {
1011 Linkage = llvm::GlobalValue::InternalLinkage;
1012 }
1013 }
1014
1015 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
1016 llvm::GlobalVariable &GV) {
1017 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
1018 // Shadow variables and their properties must be registered with CUDA
1019 // runtime. Skip Extern global variables, which will be registered in
1020 // the TU where they are defined.
1021 //
1022 // Don't register a C++17 inline variable. The local symbol can be
1023 // discarded and referencing a discarded local symbol from outside the
1024 // comdat (__cuda_register_globals) is disallowed by the ELF spec.
1025 //
1026 // HIP managed variables need to be always recorded in device and host
1027 // compilations for transformation.
1028 //
1029 // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
1030 // added to llvm.compiler-used, therefore they are safe to be registered.
1031 if ((!D->hasExternalStorage() && !D->isInline()) ||
1032 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1033 D->hasAttr<HIPManagedAttr>()) {
1034 registerDeviceVar(D, GV, !D->hasDefinition(),
1035 D->hasAttr<CUDAConstantAttr>());
1036 }
1037 } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1038 D->getType()->isCUDADeviceBuiltinTextureType()) {
1039 // Builtin surfaces and textures and their template arguments are
1040 // also registered with CUDA runtime.
1041 const auto *TD = cast<ClassTemplateSpecializationDecl>(
1042 D->getType()->castAs<RecordType>()->getDecl());
1043 const TemplateArgumentList &Args = TD->getTemplateArgs();
1044 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1045 assert(Args.size() == 2 &&
1046 "Unexpected number of template arguments of CUDA device "
1047 "builtin surface type.");
1048 auto SurfType = Args[1].getAsIntegral();
1049 if (!D->hasExternalStorage())
1050 registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
1051 } else {
1052 assert(Args.size() == 3 &&
1053 "Unexpected number of template arguments of CUDA device "
1054 "builtin texture type.");
1055 auto TexType = Args[1].getAsIntegral();
1056 auto Normalized = Args[2].getAsIntegral();
1057 if (!D->hasExternalStorage())
1058 registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
1059 Normalized.getZExtValue());
1060 }
1061 }
1062 }
1063
1064 // Transform managed variables to pointers to managed variables in device code.
1065 // Each use of the original managed variable is replaced by a load from the
1066 // transformed managed variable. The transformed managed variable contains
1067 // the address of managed memory which will be allocated by the runtime.
1068 void CGNVCUDARuntime::transformManagedVars() {
1069 for (auto &&Info : DeviceVars) {
1070 llvm::GlobalVariable *Var = Info.Var;
1071 if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1072 Info.Flags.isManaged()) {
1073 auto ManagedVar = new llvm::GlobalVariable(
1074 CGM.getModule(), Var->getType(),
1075 /*isConstant=*/false, Var->getLinkage(),
1076 /*Init=*/Var->isDeclaration()
1077 ? nullptr
1078 : llvm::ConstantPointerNull::get(Var->getType()),
1079 /*Name=*/"", /*InsertBefore=*/nullptr,
1080 llvm::GlobalVariable::NotThreadLocal,
1081 CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
1082 ManagedVar->setDSOLocal(Var->isDSOLocal());
1083 ManagedVar->setVisibility(Var->getVisibility());
1084 ManagedVar->setExternallyInitialized(true);
1085 replaceManagedVar(Var, ManagedVar);
1086 ManagedVar->takeName(Var);
1087 Var->setName(Twine(ManagedVar->getName()) + ".managed");
1088 // Keep managed variables even if they are not used in device code since
1089 // they need to be allocated by the runtime.
1090 if (!Var->isDeclaration()) {
1091 assert(!ManagedVar->isDeclaration());
1092 CGM.addCompilerUsedGlobal(Var);
1093 CGM.addCompilerUsedGlobal(ManagedVar);
1094 }
1095 }
1096 }
1097 }
1098
1099 // Returns module constructor to be added.
1100 llvm::Function *CGNVCUDARuntime::finalizeModule() {
1101 if (CGM.getLangOpts().CUDAIsDevice) {
1102 transformManagedVars();
1103
1104 // Mark ODR-used device variables as compiler used to prevent it from being
1105 // eliminated by optimization. This is necessary for device variables
1106 // ODR-used by host functions. Sema correctly marks them as ODR-used no
1107 // matter whether they are ODR-used by device or host functions.
1108 //
1109 // We do not need to do this if the variable has used attribute since it
1110 // has already been added.
1111 //
1112 // Static device variables have been externalized at this point, therefore
1113 // variables with LLVM private or internal linkage need not be added.
1114 for (auto &&Info : DeviceVars) {
1115 auto Kind = Info.Flags.getKind();
1116 if (!Info.Var->isDeclaration() &&
1117 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1118 (Kind == DeviceVarFlags::Variable ||
1119 Kind == DeviceVarFlags::Surface ||
1120 Kind == DeviceVarFlags::Texture) &&
1121 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1122 CGM.addCompilerUsedGlobal(Info.Var);
1123 }
1124 }
1125 return nullptr;
1126 }
1127 return makeModuleCtorFunction();
1128 }
1129
1130 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1131 GlobalDecl GD) {
1132 auto Loc = KernelHandles.find(F);
1133 if (Loc != KernelHandles.end())
1134 return Loc->second;
1135
1136 if (!CGM.getLangOpts().HIP) {
1137 KernelHandles[F] = F;
1138 KernelStubs[F] = F;
1139 return F;
1140 }
1141
1142 auto *Var = new llvm::GlobalVariable(
1143 TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
1144 /*Initializer=*/nullptr,
1145 CGM.getMangledName(
1146 GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
1147 Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1148 Var->setDSOLocal(F->isDSOLocal());
1149 Var->setVisibility(F->getVisibility());
1150 KernelHandles[F] = Var;
1151 KernelStubs[Var] = F;
1152 return Var;
1153 }