CbC_llvm: lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp @ 95:afa8332a0e37 (LLVM 3.8)

author:   Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
date:     Tue, 13 Oct 2015 17:48:58 +0900
children: 7d135dc70f03
comparing 84:f3e34b893a5f with 95:afa8332a0e37

//===-- NVPTXLowerKernelArgs.cpp - Lower kernel arguments -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Pointer arguments to kernel functions need to be lowered specially.
//
// 1. Copy byval struct args to local memory. This prepares for handling
//    cases like
//
//     kernel void foo(struct A arg, ...)
//     {
//       struct A *p = &arg;
//       ...
//       ... = p->field1 ...  (there is no generic address for .param)
//       p->field2 = ...      (there is no write access to .param)
//     }
//
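//    At the IR level, the copy emitted for #1 looks roughly like the sketch
//    below (value names are illustrative; handleByValParam() below shows the
//    exact sequence):
//
//     %copy = alloca %struct.A, align 8
//     %p = addrspacecast %struct.A* %arg to %struct.A addrspace(101)*
//     %v = load %struct.A, %struct.A addrspace(101)* %p
//     store %struct.A %v, %struct.A* %copy, align 8
//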
// 2. Convert non-byval pointer arguments of CUDA kernels to pointers in the
//    global address space. This allows later optimizations to emit
//    ld.global.*/st.global.* for accessing these pointer arguments. For
//    example,
//
//    define void @foo(float* %input) {
//      %v = load float, float* %input, align 4
//      ...
//    }
//
//    becomes
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
//      %v = load float, float* %input3, align 4
//      ...
//    }
//
//    Later, NVPTXFavorNonGenericAddrSpaces will optimize it to
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %v = load float, float addrspace(1)* %input2, align 4
//      ...
//    }
//
// 3. Convert pointers in a byval kernel parameter to pointers in the global
//    address space. As in #2, this allows NVPTX to emit more ld/st.global.
//    E.g.,
//
//    struct S {
//      int *x;
//      int *y;
//    };
//    __global__ void foo(S s) {
//      int *b = s.y;
//      // use b
//    }
//
62 // "b" points to the global address space. In the IR level, | |
//
//    define void @foo({i32*, i32*}* byval %input) {
//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
//      %b = load i32*, i32** %b_ptr
//      ; use %b
//    }
//
//    becomes
//
//    define void @foo({i32*, i32*}* byval %input) {
//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
//      %b = load i32*, i32** %b_ptr
//      %b_global = addrspacecast i32* %b to i32 addrspace(1)*
//      %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
//      ; use %b_generic
//    }
//
// TODO: merge this pass with NVPTXFavorNonGenericAddrSpace so that other passes
// don't cancel the addrspacecast pair this pass emits.
//===----------------------------------------------------------------------===//

#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"

using namespace llvm;

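// Forward declaration of the pass initializer. INITIALIZE_PASS below defines
// initializeNVPTXLowerKernelArgsPass, and it must be declared in namespace
// llvm first for that definition to be well-formed.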
namespace llvm {
void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
}

namespace {
class NVPTXLowerKernelArgs : public FunctionPass {
  bool runOnFunction(Function &F) override;

  // Handle byval parameters.
  void handleByValParam(Argument *Arg);
  // Knowing Ptr must point to the global address space, this function
  // addrspacecasts Ptr to global and then back to generic. This allows
  // NVPTXFavorNonGenericAddrSpace to fold the global-to-generic cast into
  // loads/stores that appear later.
  void markPointerAsGlobal(Value *Ptr);

public:
  static char ID; // Pass identification, replacement for typeid
  NVPTXLowerKernelArgs(const NVPTXTargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}
  const char *getPassName() const override {
    return "Lower pointer arguments of CUDA kernels";
  }

private:
  const NVPTXTargetMachine *TM;
};
} // namespace

char NVPTXLowerKernelArgs::ID = 1;

INITIALIZE_PASS(NVPTXLowerKernelArgs, "nvptx-lower-kernel-args",
                "Lower kernel arguments (NVPTX)", false, false)

// =============================================================================
// If the function had a byval struct ptr arg, say foo(%struct.x *byval %d),
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
// %tv = load %struct.x, %struct.x addrspace(101)* %tempd
// store %struct.x %tv, %struct.x* %temp, align 8
//
// The above code allocates some space on the stack and copies the incoming
// struct from param space to local space.
// Then replace all occurrences of %d with %temp.
// =============================================================================
void NVPTXLowerKernelArgs::handleByValParam(Argument *Arg) {
  Function *Func = Arg->getParent();
  Instruction *FirstInst = &(Func->getEntryBlock().front());
  PointerType *PType = dyn_cast<PointerType>(Arg->getType());

  assert(PType && "Expecting pointer type in handleByValParam");

  Type *StructType = PType->getElementType();
  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
  // Set the alignment to the alignment of the byval parameter. This is
  // because later loads/stores assume that alignment, and we are going to
  // replace the use of the byval parameter with this alloca instruction.
  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
  Arg->replaceAllUsesWith(AllocA);

  Value *ArgInParam = new AddrSpaceCastInst(
      Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
      FirstInst);
  LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
  new StoreInst(LI, AllocA, FirstInst);
}

void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) {
  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
    return;

  // Decide where to emit the addrspacecast pair.
  BasicBlock::iterator InsertPt;
  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
    // Insert at the function entry if Ptr is an argument.
    InsertPt = Arg->getParent()->getEntryBlock().begin();
  } else {
    // Insert right after Ptr if Ptr is an instruction.
    InsertPt = cast<Instruction>(Ptr);
    ++InsertPt;
    assert(InsertPt != InsertPt->getParent()->end() &&
           "We don't call this function with Ptr being a terminator.");
  }

  Instruction *PtrInGlobal = new AddrSpaceCastInst(
      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
                            ADDRESS_SPACE_GLOBAL),
      Ptr->getName(), InsertPt);
  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
                                              Ptr->getName(), InsertPt);
  // Replace with PtrInGeneric all uses of Ptr except PtrInGlobal.
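  // (replaceAllUsesWith also rewrites PtrInGlobal's operand to PtrInGeneric,
  // so the setOperand call below restores Ptr as the source of the
  // global-facing cast.)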
  Ptr->replaceAllUsesWith(PtrInGeneric);
  PtrInGlobal->setOperand(0, Ptr);
}

// =============================================================================
// Main function for this pass.
// =============================================================================
bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
  // Skip non-kernels. See the comments at the top of this file.
  if (!isKernelFunction(F))
    return false;

  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
    // Mark pointers in byval structs as global.
    for (auto &B : F) {
      for (auto &I : B) {
        if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
          if (LI->getType()->isPointerTy()) {
            Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
                                            F.getParent()->getDataLayout());
            if (Argument *Arg = dyn_cast<Argument>(UO)) {
              if (Arg->hasByValAttr()) {
                // LI is a load from a pointer within a byval kernel parameter.
                markPointerAsGlobal(LI);
              }
            }
          }
        }
      }
    }
  }

  for (Argument &Arg : F.args()) {
    if (Arg.getType()->isPointerTy()) {
      if (Arg.hasByValAttr())
        handleByValParam(&Arg);
      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
        markPointerAsGlobal(&Arg);
    }
  }
  return true;
}

FunctionPass *
llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
  return new NVPTXLowerKernelArgs(TM);
}