CbC_llvm: comparison of lib/Target/NVPTX/NVPTXLowerArgs.cpp
changesets: 101:34baf5011add -> 120:1172e4bd9c6f ("update 4.0.0")
author:   mir3636
date:     Fri, 25 Nov 2016 19:14:25 +0900
children: 803732b1fca8
//===-- NVPTXLowerArgs.cpp - Lower arguments ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//
// Arguments to kernel and device functions are passed via param space,
// which imposes certain restrictions:
// http://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces
//
// Kernel parameters are read-only and accessible only via ld.param
// instruction, directly or via a pointer. Pointers to kernel
// arguments can't be converted to generic address space.
//
// Device function parameters are directly accessible via
// ld.param/st.param, but taking the address of one returns a pointer
// to a copy created in local space which *can't* be used with
// ld.param/st.param.
//
// Copying a byval struct into local memory in IR allows us to enforce
// the param space restrictions, gives the rest of IR a pointer w/o
// param space restrictions, and gives us an opportunity to eliminate
// the copy.
//
// Pointer arguments to kernel functions need more work to be lowered:
//
// 1. Convert non-byval pointer arguments of CUDA kernels to pointers in the
//    global address space. This allows later optimizations to emit
//    ld.global.*/st.global.* for accessing these pointer arguments. For
//    example,
//
//    define void @foo(float* %input) {
//      %v = load float, float* %input, align 4
//      ...
//    }
//
//    becomes
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %input3 = addrspacecast float addrspace(1)* %input2 to float*
//      %v = load float, float* %input3, align 4
//      ...
//    }
//
//    Later, NVPTXInferAddressSpaces will optimize it to
//
//    define void @foo(float* %input) {
//      %input2 = addrspacecast float* %input to float addrspace(1)*
//      %v = load float, float addrspace(1)* %input2, align 4
//      ...
//    }
//
// 2. Convert pointers in a byval kernel parameter to pointers in the global
//    address space. As with #1, this allows NVPTX to emit more ld/st.global.
//    E.g.,
//
//    struct S {
//      int *x;
//      int *y;
//    };
//    __global__ void foo(S s) {
//      int *b = s.y;
//      // use b
//    }
//
//    "b" points to the global address space. At the IR level,
//
//    define void @foo({i32*, i32*}* byval %input) {
//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
//      %b = load i32*, i32** %b_ptr
//      ; use %b
//    }
//
//    becomes
//
//    define void @foo({i32*, i32*}* byval %input) {
//      %b_ptr = getelementptr {i32*, i32*}, {i32*, i32*}* %input, i64 0, i32 1
//      %b = load i32*, i32** %b_ptr
//      %b_global = addrspacecast i32* %b to i32 addrspace(1)*
//      %b_generic = addrspacecast i32 addrspace(1)* %b_global to i32*
//      ; use %b_generic
//    }
//
// TODO: merge this pass with NVPTXInferAddressSpaces so that other passes
// don't cancel the addrspacecast pair this pass emits.
//===----------------------------------------------------------------------===//

#include "NVPTX.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXUtilities.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace llvm {
void initializeNVPTXLowerArgsPass(PassRegistry &);
}

namespace {
class NVPTXLowerArgs : public FunctionPass {
  bool runOnFunction(Function &F) override;

  bool runOnKernelFunction(Function &F);
  bool runOnDeviceFunction(Function &F);

  // Handle byval parameters.
  void handleByValParam(Argument *Arg);
  // Knowing Ptr must point to the global address space, this function
  // addrspacecasts Ptr to global and then back to generic. This allows
  // NVPTXInferAddressSpaces to fold the global-to-generic cast into
  // loads/stores that appear later.
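  //
  // For illustration only, a sketch (the %p names here are hypothetical, not
  // emitted verbatim): given a generic i32* %p known to point into global
  // memory, the pass emits
  //
  //   %p.global  = addrspacecast i32* %p to i32 addrspace(1)*
  //   %p.generic = addrspacecast i32 addrspace(1)* %p.global to i32*
  //
  // and rewrites every use of %p except %p.global to use %p.generic.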
  void markPointerAsGlobal(Value *Ptr);

public:
  static char ID; // Pass identification, replacement for typeid
  NVPTXLowerArgs(const NVPTXTargetMachine *TM = nullptr)
      : FunctionPass(ID), TM(TM) {}
  StringRef getPassName() const override {
    return "Lower pointer arguments of CUDA kernels";
  }

private:
  const NVPTXTargetMachine *TM;
};
} // namespace

char NVPTXLowerArgs::ID = 1;

INITIALIZE_PASS(NVPTXLowerArgs, "nvptx-lower-args",
                "Lower arguments (NVPTX)", false, false)

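// A sketch of exercising this pass directly with opt, assuming an LLVM build
// that links in the NVPTX backend (the invocation below is an illustration,
// not something this file documents):
//
//   opt -nvptx-lower-args -S < kernel.ll > kernel.lowered.ll
//
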
// =============================================================================
// If the function had a byval struct ptr arg, say foo(%struct.x* byval %d),
// then add the following instructions to the first basic block:
//
// %temp = alloca %struct.x, align 8
// %tempd = addrspacecast %struct.x* %d to %struct.x addrspace(101)*
// %tv = load %struct.x, %struct.x addrspace(101)* %tempd
// store %struct.x %tv, %struct.x* %temp, align 8
//
// The above code allocates some space on the stack and copies the incoming
// struct from param space to local space.
// Then replace all occurrences of %d by %temp.
// =============================================================================
void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
  Function *Func = Arg->getParent();
  Instruction *FirstInst = &(Func->getEntryBlock().front());
  PointerType *PType = dyn_cast<PointerType>(Arg->getType());

  assert(PType && "Expecting pointer type in handleByValParam");

  Type *StructType = PType->getElementType();
  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
  // Set the alignment to the alignment of the byval parameter. This is
  // because later loads/stores assume that alignment, and we are going to
  // replace the use of the byval parameter with this alloca instruction.
  AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo() + 1));
  Arg->replaceAllUsesWith(AllocA);

  // Copy the incoming struct from param space into the alloca: cast the
  // argument into the param address space, load the whole struct through it,
  // and store it to the local copy.
  Value *ArgInParam = new AddrSpaceCastInst(
      Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(),
      FirstInst);
  LoadInst *LI = new LoadInst(ArgInParam, Arg->getName(), FirstInst);
  new StoreInst(LI, AllocA, FirstInst);
}

void NVPTXLowerArgs::markPointerAsGlobal(Value *Ptr) {
  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_GLOBAL)
    return;

  // Decide where to emit the addrspacecast pair.
  BasicBlock::iterator InsertPt;
  if (Argument *Arg = dyn_cast<Argument>(Ptr)) {
    // Insert at the function entry if Ptr is an argument.
    InsertPt = Arg->getParent()->getEntryBlock().begin();
  } else {
    // Insert right after Ptr if Ptr is an instruction.
    InsertPt = ++cast<Instruction>(Ptr)->getIterator();
    assert(InsertPt != InsertPt->getParent()->end() &&
           "We don't call this function with Ptr being a terminator.");
  }

  Instruction *PtrInGlobal = new AddrSpaceCastInst(
      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
                            ADDRESS_SPACE_GLOBAL),
      Ptr->getName(), &*InsertPt);
  Value *PtrInGeneric = new AddrSpaceCastInst(PtrInGlobal, Ptr->getType(),
                                              Ptr->getName(), &*InsertPt);
  // Replace all uses of Ptr (except PtrInGlobal) with PtrInGeneric. RAUW also
  // rewrites PtrInGlobal's operand, so restore it to Ptr afterwards.
  Ptr->replaceAllUsesWith(PtrInGeneric);
  PtrInGlobal->setOperand(0, Ptr);
}

// =============================================================================
// Main function for this pass.
// =============================================================================
bool NVPTXLowerArgs::runOnKernelFunction(Function &F) {
  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
    // Mark pointers in byval structs as global.
    for (auto &B : F) {
      for (auto &I : B) {
        if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
          if (LI->getType()->isPointerTy()) {
            Value *UO = GetUnderlyingObject(LI->getPointerOperand(),
                                            F.getParent()->getDataLayout());
            if (Argument *Arg = dyn_cast<Argument>(UO)) {
              if (Arg->hasByValAttr()) {
                // LI is a load from a pointer within a byval kernel parameter.
                markPointerAsGlobal(LI);
              }
            }
          }
        }
      }
    }
  }

  for (Argument &Arg : F.args()) {
    if (Arg.getType()->isPointerTy()) {
      if (Arg.hasByValAttr())
        handleByValParam(&Arg);
      else if (TM && TM->getDrvInterface() == NVPTX::CUDA)
        markPointerAsGlobal(&Arg);
    }
  }
  return true;
}

// Device functions only need to copy byval args into local memory.
bool NVPTXLowerArgs::runOnDeviceFunction(Function &F) {
  for (Argument &Arg : F.args())
    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
      handleByValParam(&Arg);
  return true;
}

bool NVPTXLowerArgs::runOnFunction(Function &F) {
  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
}

FunctionPass *
llvm::createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM) {
  return new NVPTXLowerArgs(TM);
}
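
// Pipeline note (an assumption for orientation, not guaranteed by this file):
// the NVPTX backend is expected to create this pass while setting up its IR
// passes (see NVPTXTargetMachine.cpp), roughly as in the sketch below.
//
//   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));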