Mercurial > hg > CbC > CbC_llvm
comparison clang/lib/CodeGen/CGCUDANV.cpp @ 221:79ff65ed7e25
LLVM12 Original
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 15 Jun 2021 19:15:29 +0900 |
parents | 0572611fdcc8 |
children | 5f17cb93ff66 |
comparison
equal
deleted
inserted
replaced
220:42394fc6a535 | 221:79ff65ed7e25 |
---|---|
10 // runtime library. | 10 // runtime library. |
11 // | 11 // |
12 //===----------------------------------------------------------------------===// | 12 //===----------------------------------------------------------------------===// |
13 | 13 |
14 #include "CGCUDARuntime.h" | 14 #include "CGCUDARuntime.h" |
15 #include "CGCXXABI.h" | |
15 #include "CodeGenFunction.h" | 16 #include "CodeGenFunction.h" |
16 #include "CodeGenModule.h" | 17 #include "CodeGenModule.h" |
17 #include "clang/AST/Decl.h" | 18 #include "clang/AST/Decl.h" |
18 #include "clang/Basic/Cuda.h" | 19 #include "clang/Basic/Cuda.h" |
19 #include "clang/CodeGen/CodeGenABITypes.h" | 20 #include "clang/CodeGen/CodeGenABITypes.h" |
20 #include "clang/CodeGen/ConstantInitBuilder.h" | 21 #include "clang/CodeGen/ConstantInitBuilder.h" |
21 #include "llvm/IR/BasicBlock.h" | 22 #include "llvm/IR/BasicBlock.h" |
22 #include "llvm/IR/Constants.h" | 23 #include "llvm/IR/Constants.h" |
23 #include "llvm/IR/DerivedTypes.h" | 24 #include "llvm/IR/DerivedTypes.h" |
25 #include "llvm/IR/ReplaceConstant.h" | |
24 #include "llvm/Support/Format.h" | 26 #include "llvm/Support/Format.h" |
25 | 27 |
26 using namespace clang; | 28 using namespace clang; |
27 using namespace CodeGen; | 29 using namespace CodeGen; |
28 | 30 |
39 | 41 |
40 /// Convenience reference to LLVM Context | 42 /// Convenience reference to LLVM Context |
41 llvm::LLVMContext &Context; | 43 llvm::LLVMContext &Context; |
42 /// Convenience reference to the current module | 44 /// Convenience reference to the current module |
43 llvm::Module &TheModule; | 45 llvm::Module &TheModule; |
44 /// Keeps track of kernel launch stubs emitted in this module | 46 /// Keeps track of kernel launch stubs and handles emitted in this module |
45 struct KernelInfo { | 47 struct KernelInfo { |
46 llvm::Function *Kernel; | 48 llvm::Function *Kernel; // stub function to help launch kernel |
47 const Decl *D; | 49 const Decl *D; |
48 }; | 50 }; |
49 llvm::SmallVector<KernelInfo, 16> EmittedKernels; | 51 llvm::SmallVector<KernelInfo, 16> EmittedKernels; |
52 // Map a device stub function to a symbol for identifying kernel in host code. | |
53 // For CUDA, the symbol for identifying the kernel is the same as the device | |
54 // stub function. For HIP, they are different. | |
55 llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles; | |
56 // Map a kernel handle to the kernel stub. | |
57 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs; | |
50 struct VarInfo { | 58 struct VarInfo { |
51 llvm::GlobalVariable *Var; | 59 llvm::GlobalVariable *Var; |
52 const VarDecl *D; | 60 const VarDecl *D; |
53 DeviceVarFlags Flags; | 61 DeviceVarFlags Flags; |
54 }; | 62 }; |
117 | 125 |
118 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); | 126 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); |
119 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); | 127 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); |
120 std::string getDeviceSideName(const NamedDecl *ND) override; | 128 std::string getDeviceSideName(const NamedDecl *ND) override; |
121 | 129 |
122 public: | |
123 CGNVCUDARuntime(CodeGenModule &CGM); | |
124 | |
125 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; | |
126 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, | 130 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, |
127 bool Extern, bool Constant) override { | 131 bool Extern, bool Constant) { |
128 DeviceVars.push_back({&Var, | 132 DeviceVars.push_back({&Var, |
129 VD, | 133 VD, |
130 {DeviceVarFlags::Variable, Extern, Constant, | 134 {DeviceVarFlags::Variable, Extern, Constant, |
131 /*Normalized*/ false, /*Type*/ 0}}); | 135 VD->hasAttr<HIPManagedAttr>(), |
136 /*Normalized*/ false, 0}}); | |
132 } | 137 } |
133 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, | 138 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, |
134 bool Extern, int Type) override { | 139 bool Extern, int Type) { |
135 DeviceVars.push_back({&Var, | 140 DeviceVars.push_back({&Var, |
136 VD, | 141 VD, |
137 {DeviceVarFlags::Surface, Extern, /*Constant*/ false, | 142 {DeviceVarFlags::Surface, Extern, /*Constant*/ false, |
143 /*Managed*/ false, | |
138 /*Normalized*/ false, Type}}); | 144 /*Normalized*/ false, Type}}); |
139 } | 145 } |
140 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, | 146 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, |
141 bool Extern, int Type, bool Normalized) override { | 147 bool Extern, int Type, bool Normalized) { |
142 DeviceVars.push_back({&Var, | 148 DeviceVars.push_back({&Var, |
143 VD, | 149 VD, |
144 {DeviceVarFlags::Texture, Extern, /*Constant*/ false, | 150 {DeviceVarFlags::Texture, Extern, /*Constant*/ false, |
145 Normalized, Type}}); | 151 /*Managed*/ false, Normalized, Type}}); |
146 } | 152 } |
147 | 153 |
148 /// Creates module constructor function | 154 /// Creates module constructor function |
149 llvm::Function *makeModuleCtorFunction() override; | 155 llvm::Function *makeModuleCtorFunction(); |
150 /// Creates module destructor function | 156 /// Creates module destructor function |
151 llvm::Function *makeModuleDtorFunction() override; | 157 llvm::Function *makeModuleDtorFunction(); |
158 /// Transform managed variables for device compilation. | |
159 void transformManagedVars(); | |
160 | |
161 public: | |
162 CGNVCUDARuntime(CodeGenModule &CGM); | |
163 | |
164 llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override; | |
165 llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { | |
166 auto Loc = KernelStubs.find(Handle); | |
167 assert(Loc != KernelStubs.end()); | |
168 return Loc->second; | |
169 } | |
170 void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; | |
171 void handleVarRegistration(const VarDecl *VD, | |
172 llvm::GlobalVariable &Var) override; | |
173 void | |
174 internalizeDeviceSideVar(const VarDecl *D, | |
175 llvm::GlobalValue::LinkageTypes &Linkage) override; | |
176 | |
177 llvm::Function *finalizeModule() override; | |
152 }; | 178 }; |
153 | 179 |
154 } | 180 } |
155 | 181 |
156 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { | 182 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { |
163 if (CGM.getLangOpts().HIP) | 189 if (CGM.getLangOpts().HIP) |
164 return ((Twine("__hip") + Twine(FuncName)).str()); | 190 return ((Twine("__hip") + Twine(FuncName)).str()); |
165 return ((Twine("__cuda") + Twine(FuncName)).str()); | 191 return ((Twine("__cuda") + Twine(FuncName)).str()); |
166 } | 192 } |
167 | 193 |
194 static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { | |
195 // If the host and device have different C++ ABIs, mark it as the device | |
196 // mangle context so that the mangling needs to retrieve the additional | |
197 // device lambda mangling number instead of the regular host one. | |
198 if (CGM.getContext().getAuxTargetInfo() && | |
199 CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() && | |
200 CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) { | |
201 return std::unique_ptr<MangleContext>( | |
202 CGM.getContext().createDeviceMangleContext( | |
203 *CGM.getContext().getAuxTargetInfo())); | |
204 } | |
205 | |
206 return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext( | |
207 CGM.getContext().getAuxTargetInfo())); | |
208 } | |
209 | |
168 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) | 210 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) |
169 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), | 211 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), |
170 TheModule(CGM.getModule()), | 212 TheModule(CGM.getModule()), |
171 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), | 213 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), |
172 DeviceMC(CGM.getContext().createMangleContext( | 214 DeviceMC(InitDeviceMC(CGM)) { |
173 CGM.getContext().getAuxTargetInfo())) { | |
174 CodeGen::CodeGenTypes &Types = CGM.getTypes(); | 215 CodeGen::CodeGenTypes &Types = CGM.getTypes(); |
175 ASTContext &Ctx = CGM.getContext(); | 216 ASTContext &Ctx = CGM.getContext(); |
176 | 217 |
177 IntTy = CGM.IntTy; | 218 IntTy = CGM.IntTy; |
178 SizeTy = CGM.SizeTy; | 219 SizeTy = CGM.SizeTy; |
225 if (auto *FD = dyn_cast<FunctionDecl>(ND)) | 266 if (auto *FD = dyn_cast<FunctionDecl>(ND)) |
226 GD = GlobalDecl(FD, KernelReferenceKind::Kernel); | 267 GD = GlobalDecl(FD, KernelReferenceKind::Kernel); |
227 else | 268 else |
228 GD = GlobalDecl(ND); | 269 GD = GlobalDecl(ND); |
229 std::string DeviceSideName; | 270 std::string DeviceSideName; |
230 if (DeviceMC->shouldMangleDeclName(ND)) { | 271 MangleContext *MC; |
272 if (CGM.getLangOpts().CUDAIsDevice) | |
273 MC = &CGM.getCXXABI().getMangleContext(); | |
274 else | |
275 MC = DeviceMC.get(); | |
276 if (MC->shouldMangleDeclName(ND)) { | |
231 SmallString<256> Buffer; | 277 SmallString<256> Buffer; |
232 llvm::raw_svector_ostream Out(Buffer); | 278 llvm::raw_svector_ostream Out(Buffer); |
233 DeviceMC->mangleName(GD, Out); | 279 MC->mangleName(GD, Out); |
234 DeviceSideName = std::string(Out.str()); | 280 DeviceSideName = std::string(Out.str()); |
235 } else | 281 } else |
236 DeviceSideName = std::string(ND->getIdentifier()->getName()); | 282 DeviceSideName = std::string(ND->getIdentifier()->getName()); |
283 | |
284 // Make unique name for device side static file-scope variable for HIP. | |
285 if (CGM.getContext().shouldExternalizeStaticVar(ND) && | |
286 CGM.getLangOpts().GPURelocatableDeviceCode && | |
287 !CGM.getLangOpts().CUID.empty()) { | |
288 SmallString<256> Buffer; | |
289 llvm::raw_svector_ostream Out(Buffer); | |
290 Out << DeviceSideName; | |
291 CGM.printPostfixForExternalizedStaticVar(Out); | |
292 DeviceSideName = std::string(Out.str()); | |
293 } | |
237 return DeviceSideName; | 294 return DeviceSideName; |
238 } | 295 } |
239 | 296 |
240 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, | 297 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, |
241 FunctionArgList &Args) { | 298 FunctionArgList &Args) { |
242 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); | 299 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); |
300 if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) { | |
301 GV->setLinkage(CGF.CurFn->getLinkage()); | |
302 GV->setInitializer(CGF.CurFn); | |
303 } | |
243 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), | 304 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
244 CudaFeature::CUDA_USES_NEW_LAUNCH) || | 305 CudaFeature::CUDA_USES_NEW_LAUNCH) || |
245 CGF.getLangOpts().HIPUseNewLaunchAPI) | 306 (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) |
246 emitDeviceStubBodyNew(CGF, Args); | 307 emitDeviceStubBodyNew(CGF, Args); |
247 else | 308 else |
248 emitDeviceStubBodyLegacy(CGF, Args); | 309 emitDeviceStubBodyLegacy(CGF, Args); |
249 } | 310 } |
250 | 311 |
281 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); | 342 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); |
282 auto LaunchKernelName = addPrefixToName("LaunchKernel"); | 343 auto LaunchKernelName = addPrefixToName("LaunchKernel"); |
283 IdentifierInfo &cudaLaunchKernelII = | 344 IdentifierInfo &cudaLaunchKernelII = |
284 CGM.getContext().Idents.get(LaunchKernelName); | 345 CGM.getContext().Idents.get(LaunchKernelName); |
285 FunctionDecl *cudaLaunchKernelFD = nullptr; | 346 FunctionDecl *cudaLaunchKernelFD = nullptr; |
286 for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) { | 347 for (auto *Result : DC->lookup(&cudaLaunchKernelII)) { |
287 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) | 348 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) |
288 cudaLaunchKernelFD = FD; | 349 cudaLaunchKernelFD = FD; |
289 } | 350 } |
290 | 351 |
291 if (cudaLaunchKernelFD == nullptr) { | 352 if (cudaLaunchKernelFD == nullptr) { |
316 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, | 377 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, |
317 {GridDim.getPointer(), BlockDim.getPointer(), | 378 {GridDim.getPointer(), BlockDim.getPointer(), |
318 ShmemSize.getPointer(), Stream.getPointer()}); | 379 ShmemSize.getPointer(), Stream.getPointer()}); |
319 | 380 |
320 // Emit the call to cudaLaunch | 381 // Emit the call to cudaLaunch |
321 llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); | 382 llvm::Value *Kernel = |
383 CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy); | |
322 CallArgList LaunchKernelArgs; | 384 CallArgList LaunchKernelArgs; |
323 LaunchKernelArgs.add(RValue::get(Kernel), | 385 LaunchKernelArgs.add(RValue::get(Kernel), |
324 cudaLaunchKernelFD->getParamDecl(0)->getType()); | 386 cudaLaunchKernelFD->getParamDecl(0)->getType()); |
325 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); | 387 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); |
326 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); | 388 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); |
352 // Emit a call to cudaSetupArgument for each arg in Args. | 414 // Emit a call to cudaSetupArgument for each arg in Args. |
353 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); | 415 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); |
354 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); | 416 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
355 CharUnits Offset = CharUnits::Zero(); | 417 CharUnits Offset = CharUnits::Zero(); |
356 for (const VarDecl *A : Args) { | 418 for (const VarDecl *A : Args) { |
357 CharUnits TyWidth, TyAlign; | 419 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType()); |
358 std::tie(TyWidth, TyAlign) = | 420 Offset = Offset.alignTo(TInfo.Align); |
359 CGM.getContext().getTypeInfoInChars(A->getType()); | |
360 Offset = Offset.alignTo(TyAlign); | |
361 llvm::Value *Args[] = { | 421 llvm::Value *Args[] = { |
362 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), | 422 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), |
363 VoidPtrTy), | 423 VoidPtrTy), |
364 llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()), | 424 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), |
365 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), | 425 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), |
366 }; | 426 }; |
367 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); | 427 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); |
368 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); | 428 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); |
369 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); | 429 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); |
370 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); | 430 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); |
371 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); | 431 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); |
372 CGF.EmitBlock(NextBlock); | 432 CGF.EmitBlock(NextBlock); |
373 Offset += TyWidth; | 433 Offset += TInfo.Width; |
374 } | 434 } |
375 | 435 |
376 // Emit the call to cudaLaunch | 436 // Emit the call to cudaLaunch |
377 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); | 437 llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); |
378 llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy); | 438 llvm::Value *Arg = |
439 CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy); | |
379 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); | 440 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); |
380 CGF.EmitBranch(EndBlock); | 441 CGF.EmitBranch(EndBlock); |
381 | 442 |
382 CGF.EmitBlock(EndBlock); | 443 CGF.EmitBlock(EndBlock); |
444 } | |
445 | |
446 // Replace the original variable Var with the address loaded from variable | |
447 // ManagedVar populated by HIP runtime. | |
448 static void replaceManagedVar(llvm::GlobalVariable *Var, | |
449 llvm::GlobalVariable *ManagedVar) { | |
450 SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; | |
451 for (auto &&VarUse : Var->uses()) { | |
452 WorkList.push_back({VarUse.getUser()}); | |
453 } | |
454 while (!WorkList.empty()) { | |
455 auto &&WorkItem = WorkList.pop_back_val(); | |
456 auto *U = WorkItem.back(); | |
457 if (isa<llvm::ConstantExpr>(U)) { | |
458 for (auto &&UU : U->uses()) { | |
459 WorkItem.push_back(UU.getUser()); | |
460 WorkList.push_back(WorkItem); | |
461 WorkItem.pop_back(); | |
462 } | |
463 continue; | |
464 } | |
465 if (auto *I = dyn_cast<llvm::Instruction>(U)) { | |
466 llvm::Value *OldV = Var; | |
467 llvm::Instruction *NewV = | |
468 new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false, | |
469 llvm::Align(Var->getAlignment()), I); | |
470 WorkItem.pop_back(); | |
471 // Replace constant expressions directly or indirectly using the managed | |
472 // variable with instructions. | |
473 for (auto &&Op : WorkItem) { | |
474 auto *CE = cast<llvm::ConstantExpr>(Op); | |
475 auto *NewInst = llvm::createReplacementInstr(CE, I); | |
476 NewInst->replaceUsesOfWith(OldV, NewV); | |
477 OldV = CE; | |
478 NewV = NewInst; | |
479 } | |
480 I->replaceUsesOfWith(OldV, NewV); | |
481 } else { | |
482 llvm_unreachable("Invalid use of managed variable"); | |
483 } | |
484 } | |
383 } | 485 } |
384 | 486 |
385 /// Creates a function that sets up state on the host side for CUDA objects that | 487 /// Creates a function that sets up state on the host side for CUDA objects that |
386 /// have a presence on both the host and device sides. Specifically, registers | 488 /// have a presence on both the host and device sides. Specifically, registers |
387 /// the host side of kernel functions and device global variables with the CUDA | 489 /// the host side of kernel functions and device global variables with the CUDA |
426 llvm::Constant *KernelName = | 528 llvm::Constant *KernelName = |
427 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); | 529 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); |
428 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); | 530 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); |
429 llvm::Value *Args[] = { | 531 llvm::Value *Args[] = { |
430 &GpuBinaryHandlePtr, | 532 &GpuBinaryHandlePtr, |
431 Builder.CreateBitCast(I.Kernel, VoidPtrTy), | 533 Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy), |
432 KernelName, | 534 KernelName, |
433 KernelName, | 535 KernelName, |
434 llvm::ConstantInt::get(IntTy, -1), | 536 llvm::ConstantInt::get(IntTy, -1), |
435 NullPtr, | 537 NullPtr, |
436 NullPtr, | 538 NullPtr, |
452 CharPtrTy, IntTy, VarSizeTy, | 554 CharPtrTy, IntTy, VarSizeTy, |
453 IntTy, IntTy}; | 555 IntTy, IntTy}; |
454 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( | 556 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( |
455 llvm::FunctionType::get(VoidTy, RegisterVarParams, false), | 557 llvm::FunctionType::get(VoidTy, RegisterVarParams, false), |
456 addUnderscoredPrefixToName("RegisterVar")); | 558 addUnderscoredPrefixToName("RegisterVar")); |
559 // void __hipRegisterManagedVar(void **, char *, char *, const char *, | |
560 // size_t, unsigned) | |
561 llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, | |
562 CharPtrTy, VarSizeTy, IntTy}; | |
563 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( | |
564 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), | |
565 addUnderscoredPrefixToName("RegisterManagedVar")); | |
457 // void __cudaRegisterSurface(void **, const struct surfaceReference *, | 566 // void __cudaRegisterSurface(void **, const struct surfaceReference *, |
458 // const void **, const char *, int, int); | 567 // const void **, const char *, int, int); |
459 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( | 568 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( |
460 llvm::FunctionType::get( | 569 llvm::FunctionType::get( |
461 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, | 570 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, |
469 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, | 578 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, |
470 false), | 579 false), |
471 addUnderscoredPrefixToName("RegisterTexture")); | 580 addUnderscoredPrefixToName("RegisterTexture")); |
472 for (auto &&Info : DeviceVars) { | 581 for (auto &&Info : DeviceVars) { |
473 llvm::GlobalVariable *Var = Info.Var; | 582 llvm::GlobalVariable *Var = Info.Var; |
583 assert((!Var->isDeclaration() || Info.Flags.isManaged()) && | |
584 "External variables should not show up here, except HIP managed " | |
585 "variables"); | |
474 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); | 586 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); |
475 switch (Info.Flags.getKind()) { | 587 switch (Info.Flags.getKind()) { |
476 case DeviceVarFlags::Variable: { | 588 case DeviceVarFlags::Variable: { |
477 uint64_t VarSize = | 589 uint64_t VarSize = |
478 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); | 590 CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); |
479 llvm::Value *Args[] = { | 591 if (Info.Flags.isManaged()) { |
480 &GpuBinaryHandlePtr, | 592 auto ManagedVar = new llvm::GlobalVariable( |
481 Builder.CreateBitCast(Var, VoidPtrTy), | 593 CGM.getModule(), Var->getType(), |
482 VarName, | 594 /*isConstant=*/false, Var->getLinkage(), |
483 VarName, | 595 /*Init=*/Var->isDeclaration() |
484 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), | 596 ? nullptr |
485 llvm::ConstantInt::get(VarSizeTy, VarSize), | 597 : llvm::ConstantPointerNull::get(Var->getType()), |
486 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), | 598 /*Name=*/"", /*InsertBefore=*/nullptr, |
487 llvm::ConstantInt::get(IntTy, 0)}; | 599 llvm::GlobalVariable::NotThreadLocal); |
488 Builder.CreateCall(RegisterVar, Args); | 600 ManagedVar->setDSOLocal(Var->isDSOLocal()); |
601 ManagedVar->setVisibility(Var->getVisibility()); | |
602 ManagedVar->setExternallyInitialized(true); | |
603 ManagedVar->takeName(Var); | |
604 Var->setName(Twine(ManagedVar->getName() + ".managed")); | |
605 replaceManagedVar(Var, ManagedVar); | |
606 llvm::Value *Args[] = { | |
607 &GpuBinaryHandlePtr, | |
608 Builder.CreateBitCast(ManagedVar, VoidPtrTy), | |
609 Builder.CreateBitCast(Var, VoidPtrTy), | |
610 VarName, | |
611 llvm::ConstantInt::get(VarSizeTy, VarSize), | |
612 llvm::ConstantInt::get(IntTy, Var->getAlignment())}; | |
613 if (!Var->isDeclaration()) | |
614 Builder.CreateCall(RegisterManagedVar, Args); | |
615 } else { | |
616 llvm::Value *Args[] = { | |
617 &GpuBinaryHandlePtr, | |
618 Builder.CreateBitCast(Var, VoidPtrTy), | |
619 VarName, | |
620 VarName, | |
621 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), | |
622 llvm::ConstantInt::get(VarSizeTy, VarSize), | |
623 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), | |
624 llvm::ConstantInt::get(IntTy, 0)}; | |
625 Builder.CreateCall(RegisterVar, Args); | |
626 } | |
489 break; | 627 break; |
490 } | 628 } |
491 case DeviceVarFlags::Surface: | 629 case DeviceVarFlags::Surface: |
492 Builder.CreateCall( | 630 Builder.CreateCall( |
493 RegisterSurf, | 631 RegisterSurf, |
595 ModuleIDPrefix = "__hip_"; | 733 ModuleIDPrefix = "__hip_"; |
596 | 734 |
597 if (CudaGpuBinary) { | 735 if (CudaGpuBinary) { |
598 // If fatbin is available from early finalization, create a string | 736 // If fatbin is available from early finalization, create a string |
599 // literal containing the fat binary loaded from the given file. | 737 // literal containing the fat binary loaded from the given file. |
600 FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), | 738 const unsigned HIPCodeObjectAlign = 4096; |
601 "", FatbinConstantName, 8); | 739 FatBinStr = |
740 makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", | |
741 FatbinConstantName, HIPCodeObjectAlign); | |
602 } else { | 742 } else { |
603 // If fatbin is not available, create an external symbol | 743 // If fatbin is not available, create an external symbol |
604 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed | 744 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed |
605 // to contain the fat binary but will be populated somewhere else, | 745 // to contain the fat binary but will be populated somewhere else, |
606 // e.g. by lld through link script. | 746 // e.g. by lld through link script. |
844 } | 984 } |
845 | 985 |
846 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { | 986 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { |
847 return new CGNVCUDARuntime(CGM); | 987 return new CGNVCUDARuntime(CGM); |
848 } | 988 } |
989 | |
990 void CGNVCUDARuntime::internalizeDeviceSideVar( | |
991 const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { | |
992 // For -fno-gpu-rdc, host-side shadows of external declarations of device-side | |
993 // global variables become internal definitions. These have to be internal in | |
994 // order to prevent name conflicts with global host variables with the same | |
995 // name in a different TUs. | |
996 // | |
997 // For -fgpu-rdc, the shadow variables should not be internalized because | |
998 // they may be accessed by different TU. | |
999 if (CGM.getLangOpts().GPURelocatableDeviceCode) | |
1000 return; | |
1001 | |
1002 // __shared__ variables are odd. Shadows do get created, but | |
1003 // they are not registered with the CUDA runtime, so they | |
1004 // can't really be used to access their device-side | |
1005 // counterparts. It's not clear yet whether it's nvcc's bug or | |
1006 // a feature, but we've got to do the same for compatibility. | |
1007 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() || | |
1008 D->hasAttr<CUDASharedAttr>() || | |
1009 D->getType()->isCUDADeviceBuiltinSurfaceType() || | |
1010 D->getType()->isCUDADeviceBuiltinTextureType()) { | |
1011 Linkage = llvm::GlobalValue::InternalLinkage; | |
1012 } | |
1013 } | |
1014 | |
1015 void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, | |
1016 llvm::GlobalVariable &GV) { | |
1017 if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) { | |
1018 // Shadow variables and their properties must be registered with CUDA | |
1019 // runtime. Skip Extern global variables, which will be registered in | |
1020 // the TU where they are defined. | |
1021 // | |
1022 // Don't register a C++17 inline variable. The local symbol can be | |
1023 // discarded and referencing a discarded local symbol from outside the | |
1024 // comdat (__cuda_register_globals) is disallowed by the ELF spec. | |
1025 // | |
1026 // HIP managed variables need to be always recorded in device and host | |
1027 // compilations for transformation. | |
1028 // | |
1029 // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are | |
1030 // added to llvm.compiler-used, therefore they are safe to be registered. | |
1031 if ((!D->hasExternalStorage() && !D->isInline()) || | |
1032 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) || | |
1033 D->hasAttr<HIPManagedAttr>()) { | |
1034 registerDeviceVar(D, GV, !D->hasDefinition(), | |
1035 D->hasAttr<CUDAConstantAttr>()); | |
1036 } | |
1037 } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() || | |
1038 D->getType()->isCUDADeviceBuiltinTextureType()) { | |
1039 // Builtin surfaces and textures and their template arguments are | |
1040 // also registered with CUDA runtime. | |
1041 const auto *TD = cast<ClassTemplateSpecializationDecl>( | |
1042 D->getType()->castAs<RecordType>()->getDecl()); | |
1043 const TemplateArgumentList &Args = TD->getTemplateArgs(); | |
1044 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) { | |
1045 assert(Args.size() == 2 && | |
1046 "Unexpected number of template arguments of CUDA device " | |
1047 "builtin surface type."); | |
1048 auto SurfType = Args[1].getAsIntegral(); | |
1049 if (!D->hasExternalStorage()) | |
1050 registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue()); | |
1051 } else { | |
1052 assert(Args.size() == 3 && | |
1053 "Unexpected number of template arguments of CUDA device " | |
1054 "builtin texture type."); | |
1055 auto TexType = Args[1].getAsIntegral(); | |
1056 auto Normalized = Args[2].getAsIntegral(); | |
1057 if (!D->hasExternalStorage()) | |
1058 registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(), | |
1059 Normalized.getZExtValue()); | |
1060 } | |
1061 } | |
1062 } | |
1063 | |
1064 // Transform managed variables to pointers to managed variables in device code. | |
1065 // Each use of the original managed variable is replaced by a load from the | |
1066 // transformed managed variable. The transformed managed variable contains | |
1067 // the address of managed memory which will be allocated by the runtime. | |
1068 void CGNVCUDARuntime::transformManagedVars() { | |
1069 for (auto &&Info : DeviceVars) { | |
1070 llvm::GlobalVariable *Var = Info.Var; | |
1071 if (Info.Flags.getKind() == DeviceVarFlags::Variable && | |
1072 Info.Flags.isManaged()) { | |
1073 auto ManagedVar = new llvm::GlobalVariable( | |
1074 CGM.getModule(), Var->getType(), | |
1075 /*isConstant=*/false, Var->getLinkage(), | |
1076 /*Init=*/Var->isDeclaration() | |
1077 ? nullptr | |
1078 : llvm::ConstantPointerNull::get(Var->getType()), | |
1079 /*Name=*/"", /*InsertBefore=*/nullptr, | |
1080 llvm::GlobalVariable::NotThreadLocal, | |
1081 CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); | |
1082 ManagedVar->setDSOLocal(Var->isDSOLocal()); | |
1083 ManagedVar->setVisibility(Var->getVisibility()); | |
1084 ManagedVar->setExternallyInitialized(true); | |
1085 replaceManagedVar(Var, ManagedVar); | |
1086 ManagedVar->takeName(Var); | |
1087 Var->setName(Twine(ManagedVar->getName()) + ".managed"); | |
1088 // Keep managed variables even if they are not used in device code since | |
1089 // they need to be allocated by the runtime. | |
1090 if (!Var->isDeclaration()) { | |
1091 assert(!ManagedVar->isDeclaration()); | |
1092 CGM.addCompilerUsedGlobal(Var); | |
1093 CGM.addCompilerUsedGlobal(ManagedVar); | |
1094 } | |
1095 } | |
1096 } | |
1097 } | |
1098 | |
1099 // Returns module constructor to be added. | |
1100 llvm::Function *CGNVCUDARuntime::finalizeModule() { | |
1101 if (CGM.getLangOpts().CUDAIsDevice) { | |
1102 transformManagedVars(); | |
1103 | |
1104 // Mark ODR-used device variables as compiler used to prevent it from being | |
1105 // eliminated by optimization. This is necessary for device variables | |
1106 // ODR-used by host functions. Sema correctly marks them as ODR-used no | |
1107 // matter whether they are ODR-used by device or host functions. | |
1108 // | |
1109 // We do not need to do this if the variable has used attribute since it | |
1110 // has already been added. | |
1111 // | |
1112 // Static device variables have been externalized at this point, therefore | |
1113 // variables with LLVM private or internal linkage need not be added. | |
1114 for (auto &&Info : DeviceVars) { | |
1115 auto Kind = Info.Flags.getKind(); | |
1116 if (!Info.Var->isDeclaration() && | |
1117 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) && | |
1118 (Kind == DeviceVarFlags::Variable || | |
1119 Kind == DeviceVarFlags::Surface || | |
1120 Kind == DeviceVarFlags::Texture) && | |
1121 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) { | |
1122 CGM.addCompilerUsedGlobal(Info.Var); | |
1123 } | |
1124 } | |
1125 return nullptr; | |
1126 } | |
1127 return makeModuleCtorFunction(); | |
1128 } | |
1129 | |
1130 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, | |
1131 GlobalDecl GD) { | |
1132 auto Loc = KernelHandles.find(F); | |
1133 if (Loc != KernelHandles.end()) | |
1134 return Loc->second; | |
1135 | |
1136 if (!CGM.getLangOpts().HIP) { | |
1137 KernelHandles[F] = F; | |
1138 KernelStubs[F] = F; | |
1139 return F; | |
1140 } | |
1141 | |
1142 auto *Var = new llvm::GlobalVariable( | |
1143 TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(), | |
1144 /*Initializer=*/nullptr, | |
1145 CGM.getMangledName( | |
1146 GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel))); | |
1147 Var->setAlignment(CGM.getPointerAlign().getAsAlign()); | |
1148 Var->setDSOLocal(F->isDSOLocal()); | |
1149 Var->setVisibility(F->getVisibility()); | |
1150 KernelHandles[F] = Var; | |
1151 KernelStubs[Var] = F; | |
1152 return Var; | |
1153 } |