CbC/CbC_llvm: clang/lib/CodeGen/CGCUDANV.cpp comparison

comparison clang/lib/CodeGen/CGCUDANV.cpp @ 221:79ff65ed7e25

LLVM12 Original

author	Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date	Tue, 15 Jun 2021 19:15:29 +0900
parents	0572611fdcc8
children	5f17cb93ff66

comparison

equal deleted inserted replaced

-:42394fc6a535
+:79ff65ed7e25
 // runtime library.
 //
 //===----------------------------------------------------------------------===//
 #include "CGCUDARuntime.h"
+#include "CGCXXABI.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/Cuda.h"
 #include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/ReplaceConstant.h"
 #include "llvm/Support/Format.h"
 using namespace clang;
 using namespace CodeGen;
 /// Convenience reference to LLVM Context
 llvm::LLVMContext &Context;
 /// Convenience reference to the current module
 llvm::Module &TheModule;
-/// Keeps track of kernel launch stubs emitted in this module
+/// Keeps track of kernel launch stubs and handles emitted in this module
 struct KernelInfo {
-llvm::Function *Kernel;
+llvm::Function *Kernel; // stub function to help launch kernel
 const Decl *D;
 };
 llvm::SmallVector<KernelInfo, 16> EmittedKernels;
+// Map a device stub function to a symbol for identifying kernel in host code.
+// For CUDA, the symbol for identifying the kernel is the same as the device
+// stub function. For HIP, they are different.
+llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
+// Map a kernel handle to the kernel stub.
+llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
 struct VarInfo { // One record per device-side variable, surface or texture that gets registered.
 llvm::GlobalVariable *Var; // the global passed to registerDeviceVar/Surf/Tex
 const VarDecl *D; // the declaration it was registered for
 DeviceVarFlags Flags; // kind plus the attributes supplied at registration
 };
 void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
 void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
 std::string getDeviceSideName(const NamedDecl *ND) override;
-public:
-CGNVCUDARuntime(CodeGenModule &CGM);
-void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
 void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
-bool Extern, bool Constant) override {
+bool Extern, bool Constant) {
 DeviceVars.push_back({&Var,
 VD,
 {DeviceVarFlags::Variable, Extern, Constant,
-/*Normalized*/ false, /*Type*/ 0}});
+VD->hasAttr<HIPManagedAttr>(),
+/*Normalized*/ false, 0}});
 }
 void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
-bool Extern, int Type) override {
+bool Extern, int Type) {
 DeviceVars.push_back({&Var,
 VD,
 {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
+/*Managed*/ false,
 /*Normalized*/ false, Type}});
 }
 void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
-bool Extern, int Type, bool Normalized) override {
+bool Extern, int Type, bool Normalized) {
 DeviceVars.push_back({&Var,
 VD,
 {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
-Normalized, Type}});
+/*Managed*/ false, Normalized, Type}});
 }
 /// Creates module constructor function
-llvm::Function *makeModuleCtorFunction() override;
+llvm::Function *makeModuleCtorFunction();
 /// Creates module destructor function
-llvm::Function *makeModuleDtorFunction() override;
+llvm::Function *makeModuleDtorFunction();
+/// Transform managed variables for device compilation.
+void transformManagedVars();
+public:
+CGNVCUDARuntime(CodeGenModule &CGM);
+llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
+llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { // Looks up the stub function recorded for a kernel handle.
+auto Loc = KernelStubs.find(Handle);
+assert(Loc != KernelStubs.end()); // Handle must already be in KernelStubs (entries are added by getKernelHandle)
+return Loc->second;
+}
+void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
+void handleVarRegistration(const VarDecl *VD,
+llvm::GlobalVariable &Var) override;
+void
+internalizeDeviceSideVar(const VarDecl *D,
+llvm::GlobalValue::LinkageTypes &Linkage) override;
+llvm::Function *finalizeModule() override;
 };
 }
 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { // Prepends the runtime-specific prefix to FuncName.
 if (CGM.getLangOpts().HIP)
 return ((Twine("__hip") + Twine(FuncName)).str()); // HIP: "__hip" + FuncName
 return ((Twine("__cuda") + Twine(FuncName)).str()); // otherwise: "__cuda" + FuncName
 }
+static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { // Builds the mangle context stored in DeviceMC.
+// If the host and device C++ ABIs differ (a Microsoft-ABI target with an
+// Itanium-family aux target), create a device mangle context so that mangling
+// retrieves the additional device lambda mangling number, not the host one.
+if (CGM.getContext().getAuxTargetInfo() &&
+CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
+CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
+return std::unique_ptr<MangleContext>(
+CGM.getContext().createDeviceMangleContext(
+*CGM.getContext().getAuxTargetInfo()));
+}
+return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext( // otherwise: a regular mangle context built from the aux target info
+CGM.getContext().getAuxTargetInfo()));
+}
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
 : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
 TheModule(CGM.getModule()),
 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
-DeviceMC(CGM.getContext().createMangleContext(
+DeviceMC(InitDeviceMC(CGM)) {
-CGM.getContext().getAuxTargetInfo())) {
 CodeGen::CodeGenTypes &Types = CGM.getTypes();
 ASTContext &Ctx = CGM.getContext();
 IntTy = CGM.IntTy;
 SizeTy = CGM.SizeTy;
 if (auto *FD = dyn_cast<FunctionDecl>(ND))
 GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
 else
 GD = GlobalDecl(ND);
 std::string DeviceSideName;
-if (DeviceMC->shouldMangleDeclName(ND)) {
+MangleContext *MC;
+if (CGM.getLangOpts().CUDAIsDevice)
+MC = &CGM.getCXXABI().getMangleContext();
+else
+MC = DeviceMC.get();
+if (MC->shouldMangleDeclName(ND)) {
 SmallString<256> Buffer;
 llvm::raw_svector_ostream Out(Buffer);
-DeviceMC->mangleName(GD, Out);
+MC->mangleName(GD, Out);
 DeviceSideName = std::string(Out.str());
 } else
 DeviceSideName = std::string(ND->getIdentifier()->getName());
+// Make unique name for device side static file-scope variable for HIP.
+if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
+CGM.getLangOpts().GPURelocatableDeviceCode &&
+!CGM.getLangOpts().CUID.empty()) {
+SmallString<256> Buffer;
+llvm::raw_svector_ostream Out(Buffer);
+Out << DeviceSideName;
+CGM.printPostfixForExternalizedStaticVar(Out);
+DeviceSideName = std::string(Out.str());
+}
 return DeviceSideName;
 }
 void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
 FunctionArgList &Args) {
 EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
+if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
+GV->setLinkage(CGF.CurFn->getLinkage());
+GV->setInitializer(CGF.CurFn);
+}
 if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
-CGF.getLangOpts().HIPUseNewLaunchAPI)
+(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
 emitDeviceStubBodyNew(CGF, Args);
 else
 emitDeviceStubBodyLegacy(CGF, Args);
 }
 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
 auto LaunchKernelName = addPrefixToName("LaunchKernel");
 IdentifierInfo &cudaLaunchKernelII =
 CGM.getContext().Idents.get(LaunchKernelName);
 FunctionDecl *cudaLaunchKernelFD = nullptr;
-for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
+for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
 if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
 cudaLaunchKernelFD = FD;
 }
 if (cudaLaunchKernelFD == nullptr) {
 CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
 {GridDim.getPointer(), BlockDim.getPointer(),
 ShmemSize.getPointer(), Stream.getPointer()});
 // Emit the call to cudaLaunch
-llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
+llvm::Value *Kernel =
+CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
 CallArgList LaunchKernelArgs;
 LaunchKernelArgs.add(RValue::get(Kernel),
 cudaLaunchKernelFD->getParamDecl(0)->getType());
 LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
 LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
 // Emit a call to cudaSetupArgument for each arg in Args.
 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
 llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 CharUnits Offset = CharUnits::Zero();
 for (const VarDecl *A : Args) {
-CharUnits TyWidth, TyAlign;
+auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
-std::tie(TyWidth, TyAlign) =
+Offset = Offset.alignTo(TInfo.Align);
-CGM.getContext().getTypeInfoInChars(A->getType());
-Offset = Offset.alignTo(TyAlign);
 llvm::Value *Args[] = {
 CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
 VoidPtrTy),
-llvm::ConstantInt::get(SizeTy, TyWidth.getQuantity()),
+llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
 llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
 };
 llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
 llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
 llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
 llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
 CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
 CGF.EmitBlock(NextBlock);
-Offset += TyWidth;
+Offset += TInfo.Width;
 }
 // Emit the call to cudaLaunch
 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
-llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
+llvm::Value *Arg =
+CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
 CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
 CGF.EmitBranch(EndBlock);
 CGF.EmitBlock(EndBlock);
+}
+// Replace every use of the original variable Var with the address loaded from
+// variable ManagedVar, which is populated by the HIP runtime.
+static void replaceManagedVar(llvm::GlobalVariable *Var,
+llvm::GlobalVariable *ManagedVar) {
+SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; // each entry is a chain of users; its last element is still to be examined
+for (auto &&VarUse : Var->uses()) {
+WorkList.push_back({VarUse.getUser()}); // seed one single-element chain per direct use of Var
+}
+while (!WorkList.empty()) {
+auto &&WorkItem = WorkList.pop_back_val();
+auto *U = WorkItem.back();
+if (isa<llvm::ConstantExpr>(U)) { // constant expression: queue one extended chain per user of U
+for (auto &&UU : U->uses()) {
+WorkItem.push_back(UU.getUser());
+WorkList.push_back(WorkItem); // the chain is copied into the work list
+WorkItem.pop_back(); // restore the chain before trying the next user
+}
+continue;
+}
+if (auto *I = dyn_cast<llvm::Instruction>(U)) { // instruction: the chain is complete and can be rewritten
+llvm::Value *OldV = Var;
+llvm::Instruction *NewV =
+new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
+llvm::Align(Var->getAlignment()), I);
+WorkItem.pop_back(); // drop I itself; what remains must all be constant expressions
+// Replace the constant expressions that directly or indirectly use the
+// managed variable with equivalent instructions, innermost first.
+for (auto &&Op : WorkItem) {
+auto *CE = cast<llvm::ConstantExpr>(Op);
+auto *NewInst = llvm::createReplacementInstr(CE, I);
+NewInst->replaceUsesOfWith(OldV, NewV);
+OldV = CE;
+NewV = NewInst;
+}
+I->replaceUsesOfWith(OldV, NewV); // finally point I at the load (or at the last replacement instruction)
+} else {
+llvm_unreachable("Invalid use of managed variable"); // users other than constant expressions and instructions are not expected
+}
+}
 }
 /// Creates a function that sets up state on the host side for CUDA objects that
 /// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
 llvm::Constant *KernelName =
 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
 llvm::Value *Args[] = {
 &GpuBinaryHandlePtr,
-Builder.CreateBitCast(I.Kernel, VoidPtrTy),
+Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
 KernelName,
 KernelName,
 llvm::ConstantInt::get(IntTy, -1),
 NullPtr,
 NullPtr,
 CharPtrTy,    IntTy,     VarSizeTy,
 IntTy,        IntTy};
 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
 llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
 addUnderscoredPrefixToName("RegisterVar"));
+// void __hipRegisterManagedVar(void **, char *, char *, const char *,
+//                              size_t, unsigned)
+llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
+CharPtrTy,    VarSizeTy, IntTy};
+llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
+llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
+addUnderscoredPrefixToName("RegisterManagedVar"));
 // void __cudaRegisterSurface(void **, const struct surfaceReference *,
 //                            const void **, const char *, int, int);
 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
 llvm::FunctionType::get(
 VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
 {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
 false),
 addUnderscoredPrefixToName("RegisterTexture"));
 for (auto &&Info : DeviceVars) {
 llvm::GlobalVariable *Var = Info.Var;
+assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
+"External variables should not show up here, except HIP managed "
+"variables");
 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
 switch (Info.Flags.getKind()) {
 case DeviceVarFlags::Variable: {
 uint64_t VarSize =
 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
-llvm::Value *Args[] = {
+if (Info.Flags.isManaged()) {
-&GpuBinaryHandlePtr,
+auto ManagedVar = new llvm::GlobalVariable(
-Builder.CreateBitCast(Var, VoidPtrTy),
+CGM.getModule(), Var->getType(),
-VarName,
+/*isConstant=*/false, Var->getLinkage(),
-VarName,
+/*Init=*/Var->isDeclaration()
-llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
+? nullptr
-llvm::ConstantInt::get(VarSizeTy, VarSize),
+: llvm::ConstantPointerNull::get(Var->getType()),
-llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
+/*Name=*/"", /*InsertBefore=*/nullptr,
-llvm::ConstantInt::get(IntTy, 0)};
+llvm::GlobalVariable::NotThreadLocal);
-Builder.CreateCall(RegisterVar, Args);
+ManagedVar->setDSOLocal(Var->isDSOLocal());
+ManagedVar->setVisibility(Var->getVisibility());
+ManagedVar->setExternallyInitialized(true);
+ManagedVar->takeName(Var);
+Var->setName(Twine(ManagedVar->getName() + ".managed"));
+replaceManagedVar(Var, ManagedVar);
+llvm::Value *Args[] = {
+&GpuBinaryHandlePtr,
+Builder.CreateBitCast(ManagedVar, VoidPtrTy),
+Builder.CreateBitCast(Var, VoidPtrTy),
+VarName,
+llvm::ConstantInt::get(VarSizeTy, VarSize),
+llvm::ConstantInt::get(IntTy, Var->getAlignment())};
+if (!Var->isDeclaration())
+Builder.CreateCall(RegisterManagedVar, Args);
+} else {
+llvm::Value *Args[] = {
+&GpuBinaryHandlePtr,
+Builder.CreateBitCast(Var, VoidPtrTy),
+VarName,
+VarName,
+llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
+llvm::ConstantInt::get(VarSizeTy, VarSize),
+llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
+llvm::ConstantInt::get(IntTy, 0)};
+Builder.CreateCall(RegisterVar, Args);
+}
 break;
 }
 case DeviceVarFlags::Surface:
 Builder.CreateCall(
 RegisterSurf,
 ModuleIDPrefix = "__hip_";
 if (CudaGpuBinary) {
 // If fatbin is available from early finalization, create a string
 // literal containing the fat binary loaded from the given file.
-FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()),
+const unsigned HIPCodeObjectAlign = 4096;
-"", FatbinConstantName, 8);
+FatBinStr =
+makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
+FatbinConstantName, HIPCodeObjectAlign);
 } else {
 // If fatbin is not available, create an external symbol
 // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
 // to contain the fat binary but will be populated somewhere else,
 // e.g. by lld through link script.
 }
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { // Factory for the runtime implemented in this file.
 return new CGNVCUDARuntime(CGM); // returns a newly allocated CGNVCUDARuntime bound to CGM
 }
+void CGNVCUDARuntime::internalizeDeviceSideVar(
+const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { // Linkage is an in/out parameter: it is overwritten only when D qualifies below
+// For -fno-gpu-rdc, host-side shadows of external declarations of device-side
+// global variables become internal definitions. These have to be internal in
+// order to prevent name conflicts with global host variables with the same
+// name in different TUs.
+//
+// For -fgpu-rdc, the shadow variables should not be internalized because
+// they may be accessed by a different TU.
+if (CGM.getLangOpts().GPURelocatableDeviceCode)
+return;
+// __shared__ variables are odd. Shadows do get created, but
+// they are not registered with the CUDA runtime, so they
+// can't really be used to access their device-side
+// counterparts. It's not clear yet whether it's nvcc's bug or
+// a feature, but we've got to do the same for compatibility.
+if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
+D->hasAttr<CUDASharedAttr>() ||
+D->getType()->isCUDADeviceBuiltinSurfaceType() ||
+D->getType()->isCUDADeviceBuiltinTextureType()) {
+Linkage = llvm::GlobalValue::InternalLinkage;
+}
+}
+void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, // Records GV in DeviceVars when D is a device/constant variable or a builtin surface/texture.
+llvm::GlobalVariable &GV) {
+if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
+// Shadow variables and their properties must be registered with CUDA
+// runtime. Skip Extern global variables, which will be registered in
+// the TU where they are defined.
+//
+// Don't register a C++17 inline variable. The local symbol can be
+// discarded and referencing a discarded local symbol from outside the
+// comdat (__cuda_register_globals) is disallowed by the ELF spec.
+//
+// HIP managed variables need to be always recorded in device and host
+// compilations for transformation.
+//
+// HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
+// added to llvm.compiler-used, therefore they are safe to be registered.
+if ((!D->hasExternalStorage() && !D->isInline()) ||
+CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
+D->hasAttr<HIPManagedAttr>()) {
+registerDeviceVar(D, GV, !D->hasDefinition(), // Extern flag: true when D has no definition
+D->hasAttr<CUDAConstantAttr>());
+}
+} else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
+D->getType()->isCUDADeviceBuiltinTextureType()) {
+// Builtin surfaces and textures and their template arguments are
+// also registered with CUDA runtime.
+const auto *TD = cast<ClassTemplateSpecializationDecl>(
+D->getType()->castAs<RecordType>()->getDecl());
+const TemplateArgumentList &Args = TD->getTemplateArgs();
+if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
+assert(Args.size() == 2 &&
+"Unexpected number of template arguments of CUDA device "
+"builtin surface type.");
+auto SurfType = Args[1].getAsIntegral(); // second template argument is forwarded as the surface Type
+if (!D->hasExternalStorage()) // declarations with external storage are not registered here
+registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
+} else {
+assert(Args.size() == 3 &&
+"Unexpected number of template arguments of CUDA device "
+"builtin texture type.");
+auto TexType = Args[1].getAsIntegral(); // second template argument is forwarded as the texture Type
+auto Normalized = Args[2].getAsIntegral(); // third template argument is forwarded as the Normalized flag
+if (!D->hasExternalStorage()) // declarations with external storage are not registered here
+registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
+Normalized.getZExtValue());
+}
+}
+}
+// Transform managed variables to pointers to managed variables in device code.
+// Each use of the original managed variable is replaced by a load from the
+// transformed managed variable. The transformed managed variable contains
+// the address of managed memory which will be allocated by the runtime.
+void CGNVCUDARuntime::transformManagedVars() {
+for (auto &&Info : DeviceVars) {
+llvm::GlobalVariable *Var = Info.Var;
+if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
+Info.Flags.isManaged()) { // only plain variables flagged as managed are transformed
+auto ManagedVar = new llvm::GlobalVariable(
+CGM.getModule(), Var->getType(),
+/*isConstant=*/false, Var->getLinkage(),
+/*Init=*/Var->isDeclaration()
+? nullptr
+: llvm::ConstantPointerNull::get(Var->getType()), // definitions start out as a null pointer; declarations get no initializer
+/*Name=*/"", /*InsertBefore=*/nullptr,
+llvm::GlobalVariable::NotThreadLocal,
+CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); // placed in the target address space for LangAS::cuda_device
+ManagedVar->setDSOLocal(Var->isDSOLocal());
+ManagedVar->setVisibility(Var->getVisibility());
+ManagedVar->setExternallyInitialized(true);
+replaceManagedVar(Var, ManagedVar); // rewrite uses of Var into loads from ManagedVar
+ManagedVar->takeName(Var); // ManagedVar takes over Var's original name
+Var->setName(Twine(ManagedVar->getName()) + ".managed"); // Var is renamed to <original name>.managed
+// Keep managed variables even if they are not used in device code since
+// they need to be allocated by the runtime.
+if (!Var->isDeclaration()) {
+assert(!ManagedVar->isDeclaration());
+CGM.addCompilerUsedGlobal(Var);
+CGM.addCompilerUsedGlobal(ManagedVar);
+}
+}
+}
+}
+// Finalizes the module; returns the module constructor to be added, or nullptr for device compilation.
+llvm::Function *CGNVCUDARuntime::finalizeModule() {
+if (CGM.getLangOpts().CUDAIsDevice) {
+transformManagedVars();
+// Mark ODR-used device variables as compiler used to prevent them from being
+// eliminated by optimization. This is necessary for device variables
+// ODR-used by host functions. Sema correctly marks them as ODR-used no
+// matter whether they are ODR-used by device or host functions.
+//
+// We do not need to do this if the variable has the used attribute since it
+// has already been added.
+//
+// Static device variables have been externalized at this point, therefore
+// variables with LLVM private or internal linkage need not be added.
+for (auto &&Info : DeviceVars) {
+auto Kind = Info.Flags.getKind();
+if (!Info.Var->isDeclaration() &&
+!llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
+(Kind == DeviceVarFlags::Variable ||
+Kind == DeviceVarFlags::Surface ||
+Kind == DeviceVarFlags::Texture) &&
+Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
+CGM.addCompilerUsedGlobal(Info.Var);
+}
+}
+return nullptr; // device compilation: no module constructor is returned
+}
+return makeModuleCtorFunction(); // non-device compilation: return the module constructor function
+}
+llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, // Returns the symbol that identifies, in host code, the kernel whose stub is F.
+GlobalDecl GD) {
+auto Loc = KernelHandles.find(F);
+if (Loc != KernelHandles.end()) // reuse the handle recorded by an earlier call
+return Loc->second;
+if (!CGM.getLangOpts().HIP) { // non-HIP: the stub function serves as its own handle
+KernelHandles[F] = F;
+KernelStubs[F] = F;
+return F;
+}
+auto *Var = new llvm::GlobalVariable( // HIP: the handle is a separate constant global of type F->getType()
+TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
+/*Initializer=*/nullptr, // no initializer here; emitDeviceStub later sets it to the stub function
+CGM.getMangledName(
+GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
+Var->setAlignment(CGM.getPointerAlign().getAsAlign());
+Var->setDSOLocal(F->isDSOLocal());
+Var->setVisibility(F->getVisibility());
+KernelHandles[F] = Var; // record both directions of the stub <-> handle mapping
+KernelStubs[Var] = F;
+return Var;
+}

Mercurial > hg > CbC > CbC_llvm

comparison clang/lib/CodeGen/CGCUDANV.cpp @ 221:79ff65ed7e25