comparison lib/Analysis/InlineCost.cpp @ 121:803732b1fca8
LLVM 5.0
author | kono
date | Fri, 27 Oct 2017 17:07:41 +0900
parents | 1172e4bd9c6f
children | 3a76565eade5
120:1172e4bd9c6f | 121:803732b1fca8 |
---|---|
16 #include "llvm/ADT/SetVector.h" | 16 #include "llvm/ADT/SetVector.h" |
17 #include "llvm/ADT/SmallPtrSet.h" | 17 #include "llvm/ADT/SmallPtrSet.h" |
18 #include "llvm/ADT/SmallVector.h" | 18 #include "llvm/ADT/SmallVector.h" |
19 #include "llvm/ADT/Statistic.h" | 19 #include "llvm/ADT/Statistic.h" |
20 #include "llvm/Analysis/AssumptionCache.h" | 20 #include "llvm/Analysis/AssumptionCache.h" |
21 #include "llvm/Analysis/BlockFrequencyInfo.h" | |
21 #include "llvm/Analysis/CodeMetrics.h" | 22 #include "llvm/Analysis/CodeMetrics.h" |
22 #include "llvm/Analysis/ConstantFolding.h" | 23 #include "llvm/Analysis/ConstantFolding.h" |
23 #include "llvm/Analysis/InstructionSimplify.h" | 24 #include "llvm/Analysis/InstructionSimplify.h" |
24 #include "llvm/Analysis/ProfileSummaryInfo.h" | 25 #include "llvm/Analysis/ProfileSummaryInfo.h" |
25 #include "llvm/Analysis/TargetTransformInfo.h" | 26 #include "llvm/Analysis/TargetTransformInfo.h" |
46 | 47 |
47 static cl::opt<int> HintThreshold( | 48 static cl::opt<int> HintThreshold( |
48 "inlinehint-threshold", cl::Hidden, cl::init(325), | 49 "inlinehint-threshold", cl::Hidden, cl::init(325), |
49 cl::desc("Threshold for inlining functions with inline hint")); | 50 cl::desc("Threshold for inlining functions with inline hint")); |
50 | 51 |
52 static cl::opt<int> | |
53 ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden, | |
54 cl::init(45), | |
55 cl::desc("Threshold for inlining cold callsites")); | |
56 | |
51 // We introduce this threshold to help performance of instrumentation based | 57 // We introduce this threshold to help performance of instrumentation based |
52 // PGO before we actually hook up inliner with analysis passes such as BPI and | 58 // PGO before we actually hook up inliner with analysis passes such as BPI and |
53 // BFI. | 59 // BFI. |
54 static cl::opt<int> ColdThreshold( | 60 static cl::opt<int> ColdThreshold( |
55 "inlinecold-threshold", cl::Hidden, cl::init(225), | 61 "inlinecold-threshold", cl::Hidden, cl::init(45), |
56 cl::desc("Threshold for inlining functions with cold attribute")); | 62 cl::desc("Threshold for inlining functions with cold attribute")); |
57 | 63 |
58 static cl::opt<int> | 64 static cl::opt<int> |
59 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000), | 65 HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000), |
60 cl::ZeroOrMore, | 66 cl::ZeroOrMore, |
61 cl::desc("Threshold for hot callsites ")); | 67 cl::desc("Threshold for hot callsites ")); |
62 | 68 |
69 static cl::opt<int> LocallyHotCallSiteThreshold( | |
70 "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore, | |
71 cl::desc("Threshold for locally hot callsites ")); | |
72 | |
73 static cl::opt<int> ColdCallSiteRelFreq( | |
74 "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore, | |
75 cl::desc("Maxmimum block frequency, expressed as a percentage of caller's " | |
76 "entry frequency, for a callsite to be cold in the absence of " | |
77 "profile information.")); | |
78 | |
79 static cl::opt<int> HotCallSiteRelFreq( | |
80 "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore, | |
81 cl::desc("Minimum block frequency, expressed as a multiple of caller's " | |
82 "entry frequency, for a callsite to be hot in the absence of " | |
83 "profile information.")); | |
84 | |
85 static cl::opt<bool> OptComputeFullInlineCost( | |
86 "inline-cost-full", cl::Hidden, cl::init(false), | |
87 cl::desc("Compute the full inline cost of a call site even when the cost " | |
88 "exceeds the threshold.")); | |
89 | |
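A quick illustration of how the two relative-frequency knobs above are read (illustrative numbers, not from the change itself): ColdCallSiteRelFreq is a percentage, while HotCallSiteRelFreq is a multiplier.

```cpp
// Assuming a caller entry frequency of 1000:
//   cold check: CallSiteFreq <  1000 * 2 / 100 =    20   (below 2% of entry)
//   hot  check: CallSiteFreq >= 1000 * 60      = 60000   (60x entry or more)
// A cold callsite is capped at inline-cold-callsite-threshold (45); a
// locally hot one is raised to locally-hot-callsite-threshold (525).
```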
63 namespace { | 90 namespace { |
64 | 91 |
65 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { | 92 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> { |
66 typedef InstVisitor<CallAnalyzer, bool> Base; | 93 typedef InstVisitor<CallAnalyzer, bool> Base; |
67 friend class InstVisitor<CallAnalyzer, bool>; | 94 friend class InstVisitor<CallAnalyzer, bool>; |
70 const TargetTransformInfo &TTI; | 97 const TargetTransformInfo &TTI; |
71 | 98 |
72 /// Getter for the cache of @llvm.assume intrinsics. | 99 /// Getter for the cache of @llvm.assume intrinsics. |
73 std::function<AssumptionCache &(Function &)> &GetAssumptionCache; | 100 std::function<AssumptionCache &(Function &)> &GetAssumptionCache; |
74 | 101 |
102 /// Getter for BlockFrequencyInfo | |
103 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI; | |
104 | |
75 /// Profile summary information. | 105 /// Profile summary information. |
76 ProfileSummaryInfo *PSI; | 106 ProfileSummaryInfo *PSI; |
77 | 107 |
78 /// The called function. | 108 /// The called function. |
79 Function &F; | 109 Function &F; |
110 | |
111 // Cache the DataLayout since we use it a lot. | |
112 const DataLayout &DL; | |
113 | |
114 /// The OptimizationRemarkEmitter available for this compilation. | |
115 OptimizationRemarkEmitter *ORE; | |
80 | 116 |
81 /// The candidate callsite being analyzed. Please do not use this to do | 117 /// The candidate callsite being analyzed. Please do not use this to do |
82 /// analysis in the caller function; we want the inline cost query to be | 118 /// analysis in the caller function; we want the inline cost query to be |
83 /// easily cacheable. Instead, use the cover function paramHasAttr. | 119 /// easily cacheable. Instead, use the cover function paramHasAttr. |
84 CallSite CandidateCS; | 120 CallSite CandidateCS; |
86 /// Tunable parameters that control the analysis. | 122 /// Tunable parameters that control the analysis. |
87 const InlineParams &Params; | 123 const InlineParams &Params; |
88 | 124 |
89 int Threshold; | 125 int Threshold; |
90 int Cost; | 126 int Cost; |
127 bool ComputeFullInlineCost; | |
91 | 128 |
92 bool IsCallerRecursive; | 129 bool IsCallerRecursive; |
93 bool IsRecursiveCall; | 130 bool IsRecursiveCall; |
94 bool ExposesReturnsTwice; | 131 bool ExposesReturnsTwice; |
95 bool HasDynamicAlloca; | 132 bool HasDynamicAlloca; |
99 bool HasFrameEscape; | 136 bool HasFrameEscape; |
100 | 137 |
101 /// Number of bytes allocated statically by the callee. | 138 /// Number of bytes allocated statically by the callee. |
102 uint64_t AllocatedSize; | 139 uint64_t AllocatedSize; |
103 unsigned NumInstructions, NumVectorInstructions; | 140 unsigned NumInstructions, NumVectorInstructions; |
104 int FiftyPercentVectorBonus, TenPercentVectorBonus; | 141 int VectorBonus, TenPercentVectorBonus; |
105 int VectorBonus; | 142 // Bonus to be applied when the callee has only one reachable basic block. |
143 int SingleBBBonus; | |
106 | 144 |
107 /// While we walk the potentially-inlined instructions, we build up and | 145 /// While we walk the potentially-inlined instructions, we build up and |
108 /// maintain a mapping of simplified values specific to this callsite. The | 146 /// maintain a mapping of simplified values specific to this callsite. The |
109 /// idea is to propagate any special information we have about arguments to | 147 /// idea is to propagate any special information we have about arguments to |
110 /// this call through the inlinable section of the function, and account for | 148 /// this call through the inlinable section of the function, and account for |
131 DenseMap<Value *, int>::iterator &CostIt); | 169 DenseMap<Value *, int>::iterator &CostIt); |
132 void disableSROA(DenseMap<Value *, int>::iterator CostIt); | 170 void disableSROA(DenseMap<Value *, int>::iterator CostIt); |
133 void disableSROA(Value *V); | 171 void disableSROA(Value *V); |
134 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt, | 172 void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt, |
135 int InstructionCost); | 173 int InstructionCost); |
136 bool isGEPOffsetConstant(GetElementPtrInst &GEP); | 174 bool isGEPFree(GetElementPtrInst &GEP); |
175 bool canFoldInboundsGEP(GetElementPtrInst &I); | |
137 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); | 176 bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset); |
138 bool simplifyCallSite(Function *F, CallSite CS); | 177 bool simplifyCallSite(Function *F, CallSite CS); |
178 template <typename Callable> | |
179 bool simplifyInstruction(Instruction &I, Callable Evaluate); | |
139 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); | 180 ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V); |
140 | 181 |
141 /// Return true if the given argument to the function being considered for | 182 /// Return true if the given argument to the function being considered for |
142 /// inlining has the given attribute set either at the call site or the | 183 /// inlining has the given attribute set either at the call site or the |
143 /// function declaration. Primarily used to inspect call site specific | 184 /// function declaration. Primarily used to inspect call site specific |
155 /// analysis. | 196 /// analysis. |
156 void updateThreshold(CallSite CS, Function &Callee); | 197 void updateThreshold(CallSite CS, Function &Callee); |
157 | 198 |
158 /// Return true if size growth is allowed when inlining the callee at CS. | 199 /// Return true if size growth is allowed when inlining the callee at CS. |
159 bool allowSizeGrowth(CallSite CS); | 200 bool allowSizeGrowth(CallSite CS); |
201 | |
202 /// Return true if \p CS is a cold callsite. | |
203 bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI); | |
204 | |
205 /// Return a higher threshold if \p CS is a hot callsite. | |
206 Optional<int> getHotCallSiteThreshold(CallSite CS, | |
207 BlockFrequencyInfo *CallerBFI); | |
160 | 208 |
161 // Custom analysis routines. | 209 // Custom analysis routines. |
162 bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues); | 210 bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues); |
163 | 211 |
164 // Disable several entry points to the visitor so we don't accidentally use | 212 // Disable several entry points to the visitor so we don't accidentally use |
181 bool visitPtrToInt(PtrToIntInst &I); | 229 bool visitPtrToInt(PtrToIntInst &I); |
182 bool visitIntToPtr(IntToPtrInst &I); | 230 bool visitIntToPtr(IntToPtrInst &I); |
183 bool visitCastInst(CastInst &I); | 231 bool visitCastInst(CastInst &I); |
184 bool visitUnaryInstruction(UnaryInstruction &I); | 232 bool visitUnaryInstruction(UnaryInstruction &I); |
185 bool visitCmpInst(CmpInst &I); | 233 bool visitCmpInst(CmpInst &I); |
234 bool visitAnd(BinaryOperator &I); | |
235 bool visitOr(BinaryOperator &I); | |
186 bool visitSub(BinaryOperator &I); | 236 bool visitSub(BinaryOperator &I); |
187 bool visitBinaryOperator(BinaryOperator &I); | 237 bool visitBinaryOperator(BinaryOperator &I); |
188 bool visitLoad(LoadInst &I); | 238 bool visitLoad(LoadInst &I); |
189 bool visitStore(StoreInst &I); | 239 bool visitStore(StoreInst &I); |
190 bool visitExtractValue(ExtractValueInst &I); | 240 bool visitExtractValue(ExtractValueInst &I); |
191 bool visitInsertValue(InsertValueInst &I); | 241 bool visitInsertValue(InsertValueInst &I); |
192 bool visitCallSite(CallSite CS); | 242 bool visitCallSite(CallSite CS); |
193 bool visitReturnInst(ReturnInst &RI); | 243 bool visitReturnInst(ReturnInst &RI); |
194 bool visitBranchInst(BranchInst &BI); | 244 bool visitBranchInst(BranchInst &BI); |
245 bool visitSelectInst(SelectInst &SI); | |
195 bool visitSwitchInst(SwitchInst &SI); | 246 bool visitSwitchInst(SwitchInst &SI); |
196 bool visitIndirectBrInst(IndirectBrInst &IBI); | 247 bool visitIndirectBrInst(IndirectBrInst &IBI); |
197 bool visitResumeInst(ResumeInst &RI); | 248 bool visitResumeInst(ResumeInst &RI); |
198 bool visitCleanupReturnInst(CleanupReturnInst &RI); | 249 bool visitCleanupReturnInst(CleanupReturnInst &RI); |
199 bool visitCatchReturnInst(CatchReturnInst &RI); | 250 bool visitCatchReturnInst(CatchReturnInst &RI); |
200 bool visitUnreachableInst(UnreachableInst &I); | 251 bool visitUnreachableInst(UnreachableInst &I); |
201 | 252 |
202 public: | 253 public: |
203 CallAnalyzer(const TargetTransformInfo &TTI, | 254 CallAnalyzer(const TargetTransformInfo &TTI, |
204 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, | 255 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, |
205 ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg, | 256 Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI, |
206 const InlineParams &Params) | 257 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, |
207 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee), | 258 Function &Callee, CallSite CSArg, const InlineParams &Params) |
259 : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), | |
260 PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), | |
208 CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), | 261 CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold), |
209 Cost(0), IsCallerRecursive(false), IsRecursiveCall(false), | 262 Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost || |
263 Params.ComputeFullInlineCost || ORE), | |
264 IsCallerRecursive(false), IsRecursiveCall(false), | |
210 ExposesReturnsTwice(false), HasDynamicAlloca(false), | 265 ExposesReturnsTwice(false), HasDynamicAlloca(false), |
211 ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), | 266 ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false), |
212 HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), | 267 HasFrameEscape(false), AllocatedSize(0), NumInstructions(0), |
213 NumVectorInstructions(0), FiftyPercentVectorBonus(0), | 268 NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0), |
214 TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0), | 269 NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), |
215 NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0), | 270 NumConstantPtrCmps(0), NumConstantPtrDiffs(0), |
216 NumConstantPtrDiffs(0), NumInstructionsSimplified(0), | 271 NumInstructionsSimplified(0), SROACostSavings(0), |
217 SROACostSavings(0), SROACostSavingsLost(0) {} | 272 SROACostSavingsLost(0) {} |
218 | 273 |
219 bool analyzeCall(CallSite CS); | 274 bool analyzeCall(CallSite CS); |
220 | 275 |
221 int getThreshold() { return Threshold; } | 276 int getThreshold() { return Threshold; } |
222 int getCost() { return Cost; } | 277 int getCost() { return Cost; } |
284 int InstructionCost) { | 339 int InstructionCost) { |
285 CostIt->second += InstructionCost; | 340 CostIt->second += InstructionCost; |
286 SROACostSavings += InstructionCost; | 341 SROACostSavings += InstructionCost; |
287 } | 342 } |
288 | 343 |
289 /// \brief Check whether a GEP's indices are all constant. | |
290 /// | |
291 /// Respects any simplified values known during the analysis of this callsite. | |
292 bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) { | |
293 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) | |
294 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I)) | |
295 return false; | |
296 | |
297 return true; | |
298 } | |
299 | |
300 /// \brief Accumulate a constant GEP offset into an APInt if possible. | 344 /// \brief Accumulate a constant GEP offset into an APInt if possible. |
301 /// | 345 /// |
302 /// Returns false if unable to compute the offset for any reason. Respects any | 346 /// Returns false if unable to compute the offset for any reason. Respects any |
303 /// simplified values known during the analysis of this callsite. | 347 /// simplified values known during the analysis of this callsite. |
304 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { | 348 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) { |
305 const DataLayout &DL = F.getParent()->getDataLayout(); | |
306 unsigned IntPtrWidth = DL.getPointerSizeInBits(); | 349 unsigned IntPtrWidth = DL.getPointerSizeInBits(); |
307 assert(IntPtrWidth == Offset.getBitWidth()); | 350 assert(IntPtrWidth == Offset.getBitWidth()); |
308 | 351 |
309 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); | 352 for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP); |
310 GTI != GTE; ++GTI) { | 353 GTI != GTE; ++GTI) { |
316 return false; | 359 return false; |
317 if (OpC->isZero()) | 360 if (OpC->isZero()) |
318 continue; | 361 continue; |
319 | 362 |
320 // Handle a struct index, which adds its field offset to the pointer. | 363 // Handle a struct index, which adds its field offset to the pointer. |
321 if (StructType *STy = dyn_cast<StructType>(*GTI)) { | 364 if (StructType *STy = GTI.getStructTypeOrNull()) { |
322 unsigned ElementIdx = OpC->getZExtValue(); | 365 unsigned ElementIdx = OpC->getZExtValue(); |
323 const StructLayout *SL = DL.getStructLayout(STy); | 366 const StructLayout *SL = DL.getStructLayout(STy); |
324 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); | 367 Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); |
325 continue; | 368 continue; |
326 } | 369 } |
327 | 370 |
328 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); | 371 APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType())); |
329 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; | 372 Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; |
330 } | 373 } |
331 return true; | 374 return true; |
375 } | |
376 | |
377 /// \brief Use TTI to check whether a GEP is free. | |
378 /// | |
379 /// Respects any simplified values known during the analysis of this callsite. | |
380 bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { | |
381 SmallVector<Value *, 4> Operands; | |
382 Operands.push_back(GEP.getOperand(0)); | |
383 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) | |
384 if (Constant *SimpleOp = SimplifiedValues.lookup(*I)) | |
385 Operands.push_back(SimpleOp); | |
386 else | |
387 Operands.push_back(*I); | |
388 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands); | |
332 } | 389 } |
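To make the accumulation above concrete, here is a minimal standalone sketch, assuming a typical x86-64 layout for a hypothetical struct; it mirrors what accumulateGEPOffset computes for a GEP like `getelementptr %S, %S* %p, i64 2, i32 1` but is not LLVM code:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Assumed example type: sizeof(S) == 16 and offsetof(S, b) == 8 on x86-64.
struct S { int32_t a; double b; };

int main() {
  uint64_t Offset = 0;
  Offset += 2 * sizeof(S);   // sequential index: Idx * TypeAllocSize(S)
  Offset += offsetof(S, b);  // struct index: StructLayout element offset
  std::printf("constant GEP offset = %llu bytes\n",
              (unsigned long long)Offset); // prints 40
  return 0;
}
```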
333 | 390 |
334 bool CallAnalyzer::visitAlloca(AllocaInst &I) { | 391 bool CallAnalyzer::visitAlloca(AllocaInst &I) { |
335 // Check whether inlining will turn a dynamic alloca into a static | 392 // Check whether inlining will turn a dynamic alloca into a static |
336 // alloca and handle that case. | 393 // alloca and handle that case. |
337 if (I.isArrayAllocation()) { | 394 if (I.isArrayAllocation()) { |
338 Constant *Size = SimplifiedValues.lookup(I.getArraySize()); | 395 Constant *Size = SimplifiedValues.lookup(I.getArraySize()); |
339 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) { | 396 if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) { |
340 const DataLayout &DL = F.getParent()->getDataLayout(); | |
341 Type *Ty = I.getAllocatedType(); | 397 Type *Ty = I.getAllocatedType(); |
342 AllocatedSize = SaturatingMultiplyAdd( | 398 AllocatedSize = SaturatingMultiplyAdd( |
343 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize); | 399 AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize); |
344 return Base::visitAlloca(I); | 400 return Base::visitAlloca(I); |
345 } | 401 } |
346 } | 402 } |
347 | 403 |
348 // Accumulate the allocated size. | 404 // Accumulate the allocated size. |
349 if (I.isStaticAlloca()) { | 405 if (I.isStaticAlloca()) { |
350 const DataLayout &DL = F.getParent()->getDataLayout(); | |
351 Type *Ty = I.getAllocatedType(); | 406 Type *Ty = I.getAllocatedType(); |
352 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize); | 407 AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize); |
353 } | 408 } |
354 | 409 |
355 // We will happily inline static alloca instructions. | 410 // We will happily inline static alloca instructions. |
375 | 430 |
376 // Phi nodes are always zero-cost. | 431 // Phi nodes are always zero-cost. |
377 return true; | 432 return true; |
378 } | 433 } |
379 | 434 |
435 /// \brief Check whether we can fold GEPs of constant-offset call site argument pointers. | |
436 /// This requires target data and inbounds GEPs. | |
437 /// | |
438 /// \return true if the specified GEP can be folded. | |
439 bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { | |
440 // Check if we have a base + offset for the pointer. | |
441 std::pair<Value *, APInt> BaseAndOffset = | |
442 ConstantOffsetPtrs.lookup(I.getPointerOperand()); | |
443 if (!BaseAndOffset.first) | |
444 return false; | |
445 | |
446 // Check if the offset of this GEP is constant, and if so accumulate it | |
447 // into Offset. | |
448 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) | |
449 return false; | |
450 | |
451 // Add the result as a new mapping to Base + Offset. | |
452 ConstantOffsetPtrs[&I] = BaseAndOffset; | |
453 | |
454 return true; | |
455 } | |
456 | |
380 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { | 457 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { |
381 Value *SROAArg; | 458 Value *SROAArg; |
382 DenseMap<Value *, int>::iterator CostIt; | 459 DenseMap<Value *, int>::iterator CostIt; |
383 bool SROACandidate = | 460 bool SROACandidate = |
384 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); | 461 lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); |
385 | 462 |
386 // Try to fold GEPs of constant-offset call site argument pointers. This | 463 // Lambda to check whether a GEP's indices are all constant. |
387 // requires target data and inbounds GEPs. | 464 auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { |
388 if (I.isInBounds()) { | 465 for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) |
389 // Check if we have a base + offset for the pointer. | 466 if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I)) |
390 Value *Ptr = I.getPointerOperand(); | |
391 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr); | |
392 if (BaseAndOffset.first) { | |
393 // Check if the offset of this GEP is constant, and if so accumulate it | |
394 // into Offset. | |
395 if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) { | |
396 // Non-constant GEPs aren't folded, and disable SROA. | |
397 if (SROACandidate) | |
398 disableSROA(CostIt); | |
399 return false; | 467 return false; |
400 } | 468 return true; |
401 | 469 }; |
402 // Add the result as a new mapping to Base + Offset. | 470 |
403 ConstantOffsetPtrs[&I] = BaseAndOffset; | 471 if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { |
404 | |
405 // Also handle SROA candidates here, we already know that the GEP is | |
406 // all-constant indexed. | |
407 if (SROACandidate) | |
408 SROAArgValues[&I] = SROAArg; | |
409 | |
410 return true; | |
411 } | |
412 } | |
413 | |
414 if (isGEPOffsetConstant(I)) { | |
415 if (SROACandidate) | 472 if (SROACandidate) |
416 SROAArgValues[&I] = SROAArg; | 473 SROAArgValues[&I] = SROAArg; |
417 | 474 |
418 // Constant GEPs are modeled as free. | 475 // Constant GEPs are modeled as free. |
419 return true; | 476 return true; |
420 } | 477 } |
421 | 478 |
422 // Variable GEPs will require math and will disable SROA. | 479 // Variable GEPs will require math and will disable SROA. |
423 if (SROACandidate) | 480 if (SROACandidate) |
424 disableSROA(CostIt); | 481 disableSROA(CostIt); |
425 return false; | 482 return isGEPFree(I); |
483 } | |
484 | |
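The practical effect of the new exit path in visitGetElementPtr, sketched under the assumption that the target folds simple GEPs into addressing modes:

```cpp
// %p = getelementptr i32, i32* %base, i64 %i   ; %i is not constant
// Before: visitGetElementPtr returned false and the GEP always cost
//         InlineConstants::InstrCost.
// After:  isGEPFree(I) consults TTI.getUserCost(), so the GEP is modeled
//         as free whenever the target reports TCC_Free for it.
```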
485 /// Simplify \p I if its operands are constants and update SimplifiedValues. | |
486 /// \p Evaluate is a callable specific to instruction type that evaluates the | |
487 /// instruction when all the operands are constants. | |
488 template <typename Callable> | |
489 bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) { | |
490 SmallVector<Constant *, 2> COps; | |
491 for (Value *Op : I.operands()) { | |
492 Constant *COp = dyn_cast<Constant>(Op); | |
493 if (!COp) | |
494 COp = SimplifiedValues.lookup(Op); | |
495 if (!COp) | |
496 return false; | |
497 COps.push_back(COp); | |
498 } | |
499 auto *C = Evaluate(COps); | |
500 if (!C) | |
501 return false; | |
502 SimplifiedValues[&I] = C; | |
503 return true; | |
426 } | 504 } |
427 | 505 |
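Every visitor below funnels constant folding through this template. As an illustration of the calling convention only, a hypothetical extra visitor for trunc (not part of this change) would be written as:

```cpp
// Hypothetical visitor, for illustration: gather constant operands, let the
// lambda build the folded constant, and cache it in SimplifiedValues.
bool CallAnalyzer::visitTrunc(TruncInst &I) {
  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
        return ConstantExpr::getTrunc(COps[0], I.getType());
      }))
    return true;
  disableSROA(I.getOperand(0));
  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
}
```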
428 bool CallAnalyzer::visitBitCast(BitCastInst &I) { | 506 bool CallAnalyzer::visitBitCast(BitCastInst &I) { |
429 // Propagate constants through bitcasts. | 507 // Propagate constants through bitcasts. |
430 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); | 508 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
431 if (!COp) | 509 return ConstantExpr::getBitCast(COps[0], I.getType()); |
432 COp = SimplifiedValues.lookup(I.getOperand(0)); | 510 })) |
433 if (COp) | 511 return true; |
434 if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) { | |
435 SimplifiedValues[&I] = C; | |
436 return true; | |
437 } | |
438 | 512 |
439 // Track base/offsets through casts | 513 // Track base/offsets through casts |
440 std::pair<Value *, APInt> BaseAndOffset = | 514 std::pair<Value *, APInt> BaseAndOffset = |
441 ConstantOffsetPtrs.lookup(I.getOperand(0)); | 515 ConstantOffsetPtrs.lookup(I.getOperand(0)); |
442 // Casts don't change the offset, just wrap it up. | 516 // Casts don't change the offset, just wrap it up. |
453 return true; | 527 return true; |
454 } | 528 } |
455 | 529 |
456 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { | 530 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { |
457 // Propagate constants through ptrtoint. | 531 // Propagate constants through ptrtoint. |
458 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); | 532 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
459 if (!COp) | 533 return ConstantExpr::getPtrToInt(COps[0], I.getType()); |
460 COp = SimplifiedValues.lookup(I.getOperand(0)); | 534 })) |
461 if (COp) | 535 return true; |
462 if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) { | |
463 SimplifiedValues[&I] = C; | |
464 return true; | |
465 } | |
466 | 536 |
467 // Track base/offset pairs when converted to a plain integer provided the | 537 // Track base/offset pairs when converted to a plain integer provided the |
468 // integer is large enough to represent the pointer. | 538 // integer is large enough to represent the pointer. |
469 unsigned IntegerSize = I.getType()->getScalarSizeInBits(); | 539 unsigned IntegerSize = I.getType()->getScalarSizeInBits(); |
470 const DataLayout &DL = F.getParent()->getDataLayout(); | |
471 if (IntegerSize >= DL.getPointerSizeInBits()) { | 540 if (IntegerSize >= DL.getPointerSizeInBits()) { |
472 std::pair<Value *, APInt> BaseAndOffset = | 541 std::pair<Value *, APInt> BaseAndOffset = |
473 ConstantOffsetPtrs.lookup(I.getOperand(0)); | 542 ConstantOffsetPtrs.lookup(I.getOperand(0)); |
474 if (BaseAndOffset.first) | 543 if (BaseAndOffset.first) |
475 ConstantOffsetPtrs[&I] = BaseAndOffset; | 544 ConstantOffsetPtrs[&I] = BaseAndOffset; |
490 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); | 559 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); |
491 } | 560 } |
492 | 561 |
493 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { | 562 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { |
494 // Propagate constants through inttoptr. | 563 // Propagate constants through inttoptr. |
495 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); | 564 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
496 if (!COp) | 565 return ConstantExpr::getIntToPtr(COps[0], I.getType()); |
497 COp = SimplifiedValues.lookup(I.getOperand(0)); | 566 })) |
498 if (COp) | 567 return true; |
499 if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) { | |
500 SimplifiedValues[&I] = C; | |
501 return true; | |
502 } | |
503 | 568 |
504 // Track base/offset pairs when round-tripped through a pointer without | 569 // Track base/offset pairs when round-tripped through a pointer without |
505 // modifications provided the integer is not too large. | 570 // modifications provided the integer is not too large. |
506 Value *Op = I.getOperand(0); | 571 Value *Op = I.getOperand(0); |
507 unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); | 572 unsigned IntegerSize = Op->getType()->getScalarSizeInBits(); |
508 const DataLayout &DL = F.getParent()->getDataLayout(); | |
509 if (IntegerSize <= DL.getPointerSizeInBits()) { | 573 if (IntegerSize <= DL.getPointerSizeInBits()) { |
510 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op); | 574 std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op); |
511 if (BaseAndOffset.first) | 575 if (BaseAndOffset.first) |
512 ConstantOffsetPtrs[&I] = BaseAndOffset; | 576 ConstantOffsetPtrs[&I] = BaseAndOffset; |
513 } | 577 } |
521 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); | 585 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); |
522 } | 586 } |
523 | 587 |
524 bool CallAnalyzer::visitCastInst(CastInst &I) { | 588 bool CallAnalyzer::visitCastInst(CastInst &I) { |
525 // Propagate constants through casts. | 589 // Propagate constants through casts. |
526 Constant *COp = dyn_cast<Constant>(I.getOperand(0)); | 590 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
527 if (!COp) | 591 return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType()); |
528 COp = SimplifiedValues.lookup(I.getOperand(0)); | 592 })) |
529 if (COp) | 593 return true; |
530 if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) { | |
531 SimplifiedValues[&I] = C; | |
532 return true; | |
533 } | |
534 | 594 |
535 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. | 595 // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere. |
536 disableSROA(I.getOperand(0)); | 596 disableSROA(I.getOperand(0)); |
537 | 597 |
538 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); | 598 return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); |
539 } | 599 } |
540 | 600 |
541 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { | 601 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) { |
542 Value *Operand = I.getOperand(0); | 602 Value *Operand = I.getOperand(0); |
543 Constant *COp = dyn_cast<Constant>(Operand); | 603 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
544 if (!COp) | 604 return ConstantFoldInstOperands(&I, COps[0], DL); |
545 COp = SimplifiedValues.lookup(Operand); | 605 })) |
546 if (COp) { | 606 return true; |
547 const DataLayout &DL = F.getParent()->getDataLayout(); | |
548 if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) { | |
549 SimplifiedValues[&I] = C; | |
550 return true; | |
551 } | |
552 } | |
553 | 607 |
554 // Disable any SROA on the argument to arbitrary unary operators. | 608 // Disable any SROA on the argument to arbitrary unary operators. |
555 disableSROA(Operand); | 609 disableSROA(Operand); |
556 | 610 |
557 return false; | 611 return false; |
558 } | 612 } |
559 | 613 |
560 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { | 614 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) { |
561 unsigned ArgNo = A->getArgNo(); | 615 return CandidateCS.paramHasAttr(A->getArgNo(), Attr); |
562 return CandidateCS.paramHasAttr(ArgNo + 1, Attr); | |
563 } | 616 } |
564 | 617 |
565 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { | 618 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) { |
566 // Does the *call site* have the NonNull attribute set on an argument? We | 619 // Does the *call site* have the NonNull attribute set on an argument? We |
567 // use the attribute on the call site to memoize any analysis done in the | 620 // use the attribute on the call site to memoize any analysis done in the |
608 return false; | 661 return false; |
609 | 662 |
610 return true; | 663 return true; |
611 } | 664 } |
612 | 665 |
666 bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) { | |
667 // If global profile summary is available, then callsite's coldness is | |
668 // determined based on that. | |
669 if (PSI && PSI->hasProfileSummary()) | |
670 return PSI->isColdCallSite(CS, CallerBFI); | |
671 | |
672 // Otherwise we need BFI to be available. | |
673 if (!CallerBFI) | |
674 return false; | |
675 | |
676 // Determine if the callsite is cold relative to caller's entry. We could | |
677 // potentially cache the computation of scaled entry frequency, but the added | |
678 // complexity is not worth it unless this scaling shows up high in the | |
679 // profiles. | |
680 const BranchProbability ColdProb(ColdCallSiteRelFreq, 100); | |
681 auto CallSiteBB = CS.getInstruction()->getParent(); | |
682 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB); | |
683 auto CallerEntryFreq = | |
684 CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock())); | |
685 return CallSiteFreq < CallerEntryFreq * ColdProb; | |
686 } | |
687 | |
688 Optional<int> | |
689 CallAnalyzer::getHotCallSiteThreshold(CallSite CS, | |
690 BlockFrequencyInfo *CallerBFI) { | |
691 | |
692 // If global profile summary is available, then callsite's hotness is | |
693 // determined based on that. | |
694 if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI)) | |
695 return Params.HotCallSiteThreshold; | |
696 | |
697 // Otherwise we need BFI to be available and to have a locally hot callsite | |
698 // threshold. | |
699 if (!CallerBFI || !Params.LocallyHotCallSiteThreshold) | |
700 return None; | |
701 | |
702 // Determine if the callsite is hot relative to caller's entry. We could | |
703 // potentially cache the computation of scaled entry frequency, but the added | |
704 // complexity is not worth it unless this scaling shows up high in the | |
705 // profiles. | |
706 auto CallSiteBB = CS.getInstruction()->getParent(); | |
707 auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency(); | |
708 auto CallerEntryFreq = CallerBFI->getEntryFreq(); | |
709 if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq) | |
710 return Params.LocallyHotCallSiteThreshold; | |
711 | |
712 // Otherwise treat it normally. | |
713 return None; | |
714 } | |
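Reading the fallback order above with the file's defaults (illustrative numbers only):

```cpp
// Assume no profile summary and CallerEntryFreq == 100:
//   CallSiteFreq == 7000: 7000 >= 100 * 60 -> LocallyHotCallSiteThreshold (525)
//   CallSiteFreq ==  500:  500 <  100 * 60 -> None (site treated normally)
// When a profile summary does mark the site hot, Params.HotCallSiteThreshold
// (3000 by default) is returned before BFI is ever consulted.
```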
715 | |
613 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { | 716 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) { |
614 // If no size growth is allowed for this inlining, set Threshold to 0. | 717 // If no size growth is allowed for this inlining, set Threshold to 0. |
615 if (!allowSizeGrowth(CS)) { | 718 if (!allowSizeGrowth(CS)) { |
616 Threshold = 0; | 719 Threshold = 0; |
617 return; | 720 return; |
627 // return max(A, B) if B is valid. | 730 // return max(A, B) if B is valid. |
628 auto MaxIfValid = [](int A, Optional<int> B) { | 731 auto MaxIfValid = [](int A, Optional<int> B) { |
629 return B ? std::max(A, B.getValue()) : A; | 732 return B ? std::max(A, B.getValue()) : A; |
630 }; | 733 }; |
631 | 734 |
735 // Various bonus percentages. These are multiplied by Threshold to get the | |
736 // bonus values. | |
737 // SingleBBBonus: This bonus is applied if the callee has a single reachable | |
738 // basic block at the given callsite context. This is speculatively applied | |
739 // and withdrawn if more than one basic block is seen. | |
740 // | |
741 // Vector bonuses: We want to more aggressively inline vector-dense kernels | |
742 // and apply this bonus based on the percentage of vector instructions. A | |
743 // bonus is applied if the vector instructions exceed 50% and half that amount | |
744 // is applied if it exceeds 10%. Note that these bonuses are somewhat | |
745 // arbitrary and evolved over time by accident as much as because they are | |
746 // principled bonuses. | |
747 // FIXME: It would be nice to base the bonus values on something more | |
748 // scientific. | |
749 // | |
750 // LastCallToStaticBonus: This large bonus is applied to ensure the inlining | |
751 // of the last call to a static function as inlining such functions is | |
752 // guaranteed to reduce code size. | |
753 // | |
754 // These bonus percentages may be set to 0 based on properties of the caller | |
755 // and the callsite. | |
756 int SingleBBBonusPercent = 50; | |
757 int VectorBonusPercent = 150; | |
758 int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus; | |
759 | |
760 // Lambda to set all the above bonus and bonus percentages to 0. | |
761 auto DisallowAllBonuses = [&]() { | |
762 SingleBBBonusPercent = 0; | |
763 VectorBonusPercent = 0; | |
764 LastCallToStaticBonus = 0; | |
765 }; | |
766 | |
632 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available | 767 // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available |
633 // and reduce the threshold if the caller has the necessary attribute. | 768 // and reduce the threshold if the caller has the necessary attribute. |
634 if (Caller->optForMinSize()) | 769 if (Caller->optForMinSize()) { |
635 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); | 770 Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold); |
636 else if (Caller->optForSize()) | 771 // For minsize, we want to disable the single BB bonus and the vector |
772 // bonuses, but not the last-call-to-static bonus. Inlining the last call to | |
773 // a static function will, at the minimum, eliminate the parameter setup and | |
774 // call/return instructions. | |
775 SingleBBBonusPercent = 0; | |
776 VectorBonusPercent = 0; | |
777 } else if (Caller->optForSize()) | |
637 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); | 778 Threshold = MinIfValid(Threshold, Params.OptSizeThreshold); |
638 | 779 |
639 bool HotCallsite = false; | 780 // Adjust the threshold based on inlinehint attribute and profile based |
640 uint64_t TotalWeight; | 781 // hotness information if the caller does not have MinSize attribute. |
641 if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) && | 782 if (!Caller->optForMinSize()) { |
642 PSI->isHotCount(TotalWeight)) { | 783 if (Callee.hasFnAttribute(Attribute::InlineHint)) |
643 HotCallsite = true; | 784 Threshold = MaxIfValid(Threshold, Params.HintThreshold); |
644 } | 785 |
645 | 786 // FIXME: After switching to the new passmanager, simplify the logic below |
646 // Listen to the inlinehint attribute or profile based hotness information | 787 // by checking only the callsite hotness/coldness as we will reliably |
647 // when it would increase the threshold and the caller does not need to | 788 // have local profile information. |
648 // minimize its size. | 789 // |
649 bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) || | 790 // Callsite hotness and coldness can be determined if sample profile is |
650 PSI->isFunctionEntryHot(&Callee); | 791 // used (which adds hotness metadata to calls) or if caller's |
651 if (InlineHint && !Caller->optForMinSize()) | 792 // BlockFrequencyInfo is available. |
652 Threshold = MaxIfValid(Threshold, Params.HintThreshold); | 793 BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr; |
653 | 794 auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI); |
654 if (HotCallsite && !Caller->optForMinSize()) | 795 if (!Caller->optForSize() && HotCallSiteThreshold) { |
655 Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold); | 796 DEBUG(dbgs() << "Hot callsite.\n"); |
656 | 797 // FIXME: This should update the threshold only if it exceeds the |
657 bool ColdCallee = PSI->isFunctionEntryCold(&Callee); | 798 // current threshold, but AutoFDO + ThinLTO currently relies on this |
658 // For cold callees, use the ColdThreshold knob if it is available and reduces | 799 // behavior to prevent inlining of hot callsites during ThinLTO |
659 // the threshold. | 800 // compile phase. |
660 if (ColdCallee) | 801 Threshold = HotCallSiteThreshold.getValue(); |
661 Threshold = MinIfValid(Threshold, Params.ColdThreshold); | 802 } else if (isColdCallSite(CS, CallerBFI)) { |
803 DEBUG(dbgs() << "Cold callsite.\n"); | |
804 // Do not apply bonuses for a cold callsite including the | |
805 // LastCallToStatic bonus. While this bonus might result in code size | |
806 // reduction, it can cause the size of a non-cold caller to increase | |
807 // preventing it from being inlined. | |
808 DisallowAllBonuses(); | |
809 Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold); | |
810 } else if (PSI) { | |
811 // Use callee's global profile information only if we have no way of | |
812 // determining this via callsite information. | |
813 if (PSI->isFunctionEntryHot(&Callee)) { | |
814 DEBUG(dbgs() << "Hot callee.\n"); | |
815 // If callsite hotness can not be determined, we may still know | |
816 // that the callee is hot and treat it as a weaker hint for threshold | |
817 // increase. | |
818 Threshold = MaxIfValid(Threshold, Params.HintThreshold); | |
819 } else if (PSI->isFunctionEntryCold(&Callee)) { | |
820 DEBUG(dbgs() << "Cold callee.\n"); | |
821 // Do not apply bonuses for a cold callee including the | |
822 // LastCallToStatic bonus. While this bonus might result in code size | |
823 // reduction, it can cause the size of a non-cold caller to increase | |
824 // preventing it from being inlined. | |
825 DisallowAllBonuses(); | |
826 Threshold = MinIfValid(Threshold, Params.ColdThreshold); | |
827 } | |
828 } | |
829 } | |
662 | 830 |
663 // Finally, take the target-specific inlining threshold multiplier into | 831 // Finally, take the target-specific inlining threshold multiplier into |
664 // account. | 832 // account. |
665 Threshold *= TTI.getInliningThresholdMultiplier(); | 833 Threshold *= TTI.getInliningThresholdMultiplier(); |
834 | |
835 SingleBBBonus = Threshold * SingleBBBonusPercent / 100; | |
836 VectorBonus = Threshold * VectorBonusPercent / 100; | |
837 | |
838 bool OnlyOneCallAndLocalLinkage = | |
839 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); | |
840 // If there is only one call of the function, and it has internal linkage, | |
841 // the cost of inlining it drops dramatically. It may seem odd to update | |
842 // Cost in updateThreshold, but the bonus depends on the logic in this method. | |
843 if (OnlyOneCallAndLocalLinkage) | |
844 Cost -= LastCallToStaticBonus; | |
666 } | 845 } |
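Plugging the stock -O2 default threshold of 225 into the bonus computation above gives a feel for the magnitudes (a worked example, not text from the change):

```cpp
//   Threshold     = 225
//   SingleBBBonus = 225 *  50 / 100 = 112   // withdrawn if a second basic
//                                           // block turns out reachable
//   VectorBonus   = 225 * 150 / 100 = 337   // granted when >50% of the
//                                           // instructions are vector ops
// And if this is the sole call of a function with local linkage, Cost is
// additionally reduced by LastCallToStaticBonus before analysis begins.
```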
667 | 846 |
668 bool CallAnalyzer::visitCmpInst(CmpInst &I) { | 847 bool CallAnalyzer::visitCmpInst(CmpInst &I) { |
669 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); | 848 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); |
670 // First try to handle simplified comparisons. | 849 // First try to handle simplified comparisons. |
671 if (!isa<Constant>(LHS)) | 850 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
672 if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) | 851 return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]); |
673 LHS = SimpleLHS; | 852 })) |
674 if (!isa<Constant>(RHS)) | 853 return true; |
675 if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) | |
676 RHS = SimpleRHS; | |
677 if (Constant *CLHS = dyn_cast<Constant>(LHS)) { | |
678 if (Constant *CRHS = dyn_cast<Constant>(RHS)) | |
679 if (Constant *C = | |
680 ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) { | |
681 SimplifiedValues[&I] = C; | |
682 return true; | |
683 } | |
684 } | |
685 | 854 |
686 if (I.getOpcode() == Instruction::FCmp) | 855 if (I.getOpcode() == Instruction::FCmp) |
687 return false; | 856 return false; |
688 | 857 |
689 // Otherwise look for a comparison between constant offset pointers with | 858 // Otherwise look for a comparison between constant offset pointers with |
726 | 895 |
727 disableSROA(CostIt); | 896 disableSROA(CostIt); |
728 } | 897 } |
729 | 898 |
730 return false; | 899 return false; |
900 } | |
901 | |
902 bool CallAnalyzer::visitOr(BinaryOperator &I) { | |
903 // This is necessary because the generic simplify instruction only works if | |
904 // both operands are constants. | |
905 for (unsigned i = 0; i < 2; ++i) { | |
906 if (ConstantInt *C = dyn_cast_or_null<ConstantInt>( | |
907 SimplifiedValues.lookup(I.getOperand(i)))) | |
908 if (C->isAllOnesValue()) { | |
909 SimplifiedValues[&I] = C; | |
910 return true; | |
911 } | |
912 } | |
913 return Base::visitOr(I); | |
914 } | |
915 | |
916 bool CallAnalyzer::visitAnd(BinaryOperator &I) { | |
917 // This is necessary because the generic simplify instruction only works if | |
918 // both operands are constants. | |
919 for (unsigned i = 0; i < 2; ++i) { | |
920 if (ConstantInt *C = dyn_cast_or_null<ConstantInt>( | |
921 SimplifiedValues.lookup(I.getOperand(i)))) | |
922 if (C->isZero()) { | |
923 SimplifiedValues[&I] = C; | |
924 return true; | |
925 } | |
926 } | |
927 return Base::visitAnd(I); | |
731 } | 928 } |
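The reason these two handlers exist, in one line of algebra: simplifyInstruction only fires when every operand is constant, while and/or each have an absorbing element, so one known operand can decide the result on its own:

```cpp
//   x | all-ones ==> all-ones    (visitOr:  C->isAllOnesValue())
//   x & 0        ==> 0           (visitAnd: C->isZero())
// The unknown operand x never needs to be resolved.
```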
732 | 929 |
733 bool CallAnalyzer::visitSub(BinaryOperator &I) { | 930 bool CallAnalyzer::visitSub(BinaryOperator &I) { |
734 // Try to handle a special case: we can fold computing the difference of two | 931 // Try to handle a special case: we can fold computing the difference of two |
735 // constant-related pointers. | 932 // constant-related pointers. |
757 return Base::visitSub(I); | 954 return Base::visitSub(I); |
758 } | 955 } |
759 | 956 |
760 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { | 957 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { |
761 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); | 958 Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); |
762 const DataLayout &DL = F.getParent()->getDataLayout(); | 959 auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) { |
763 if (!isa<Constant>(LHS)) | 960 Value *SimpleV = nullptr; |
764 if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS)) | 961 if (auto FI = dyn_cast<FPMathOperator>(&I)) |
765 LHS = SimpleLHS; | 962 SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1], |
766 if (!isa<Constant>(RHS)) | 963 FI->getFastMathFlags(), DL); |
767 if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS)) | 964 else |
768 RHS = SimpleRHS; | 965 SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL); |
769 Value *SimpleV = nullptr; | 966 return dyn_cast_or_null<Constant>(SimpleV); |
770 if (auto FI = dyn_cast<FPMathOperator>(&I)) | 967 }; |
771 SimpleV = | 968 |
772 SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL); | 969 if (simplifyInstruction(I, Evaluate)) |
773 else | |
774 SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL); | |
775 | |
776 if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) { | |
777 SimplifiedValues[&I] = C; | |
778 return true; | 970 return true; |
779 } | |
780 | 971 |
781 // Disable any SROA on arguments to arbitrary, unsimplified binary operators. | 972 // Disable any SROA on arguments to arbitrary, unsimplified binary operators. |
782 disableSROA(LHS); | 973 disableSROA(LHS); |
783 disableSROA(RHS); | 974 disableSROA(RHS); |
784 | 975 |
815 return false; | 1006 return false; |
816 } | 1007 } |
817 | 1008 |
818 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { | 1009 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) { |
819 // Constant folding for extract value is trivial. | 1010 // Constant folding for extract value is trivial. |
820 Constant *C = dyn_cast<Constant>(I.getAggregateOperand()); | 1011 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
821 if (!C) | 1012 return ConstantExpr::getExtractValue(COps[0], I.getIndices()); |
822 C = SimplifiedValues.lookup(I.getAggregateOperand()); | 1013 })) |
823 if (C) { | |
824 SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices()); | |
825 return true; | 1014 return true; |
826 } | |
827 | 1015 |
828 // SROA can look through these but give them a cost. | 1016 // SROA can look through these but give them a cost. |
829 return false; | 1017 return false; |
830 } | 1018 } |
831 | 1019 |
832 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { | 1020 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) { |
833 // Constant folding for insert value is trivial. | 1021 // Constant folding for insert value is trivial. |
834 Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand()); | 1022 if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) { |
835 if (!AggC) | 1023 return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0], |
836 AggC = SimplifiedValues.lookup(I.getAggregateOperand()); | 1024 /*InsertedValueOperand*/ COps[1], |
837 Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand()); | 1025 I.getIndices()); |
838 if (!InsertedC) | 1026 })) |
839 InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand()); | |
840 if (AggC && InsertedC) { | |
841 SimplifiedValues[&I] = | |
842 ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices()); | |
843 return true; | 1027 return true; |
844 } | |
845 | 1028 |
846 // SROA can look through these but give them a cost. | 1029 // SROA can look through these but give them a cost. |
847 return false; | 1030 return false; |
848 } | 1031 } |
849 | 1032 |
856 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) { | 1039 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) { |
857 // FIXME: Using the instsimplify logic directly for this is inefficient | 1040 // FIXME: Using the instsimplify logic directly for this is inefficient |
858 // because we have to continually rebuild the argument list even when no | 1041 // because we have to continually rebuild the argument list even when no |
859 // simplifications can be performed. Until that is fixed with remapping | 1042 // simplifications can be performed. Until that is fixed with remapping |
860 // inside of instsimplify, directly constant fold calls here. | 1043 // inside of instsimplify, directly constant fold calls here. |
861 if (!canConstantFoldCallTo(F)) | 1044 if (!canConstantFoldCallTo(CS, F)) |
862 return false; | 1045 return false; |
863 | 1046 |
864 // Try to re-map the arguments to constants. | 1047 // Try to re-map the arguments to constants. |
865 SmallVector<Constant *, 4> ConstantArgs; | 1048 SmallVector<Constant *, 4> ConstantArgs; |
866 ConstantArgs.reserve(CS.arg_size()); | 1049 ConstantArgs.reserve(CS.arg_size()); |
872 if (!C) | 1055 if (!C) |
873 return false; // This argument doesn't map to a constant. | 1056 return false; // This argument doesn't map to a constant. |
874 | 1057 |
875 ConstantArgs.push_back(C); | 1058 ConstantArgs.push_back(C); |
876 } | 1059 } |
877 if (Constant *C = ConstantFoldCall(F, ConstantArgs)) { | 1060 if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) { |
878 SimplifiedValues[CS.getInstruction()] = C; | 1061 SimplifiedValues[CS.getInstruction()] = C; |
879 return true; | 1062 return true; |
880 } | 1063 } |
881 | 1064 |
882 return false; | 1065 return false; |
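As a concrete case of the folding above (a sketch; which callees qualify is decided by canConstantFoldCallTo):

```cpp
// Callsite:  %r = call double @llvm.fabs.f64(double %x)
// If SimplifiedValues maps %x to -3.0, ConstantArgs becomes {-3.0},
// ConstantFoldCall produces 3.0, the result is cached for %r, and the
// call is modeled as free for inlining purposes.
```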
960 // during devirtualization and so we want to give it a hefty bonus for | 1143 // during devirtualization and so we want to give it a hefty bonus for |
961 // inlining, but cap that bonus in the event that inlining wouldn't pan | 1144 // inlining, but cap that bonus in the event that inlining wouldn't pan |
962 // out. Pretend to inline the function, with a custom threshold. | 1145 // out. Pretend to inline the function, with a custom threshold. |
963 auto IndirectCallParams = Params; | 1146 auto IndirectCallParams = Params; |
964 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; | 1147 IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; |
965 CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams); | 1148 CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS, |
1149 IndirectCallParams); | |
966 if (CA.analyzeCall(CS)) { | 1150 if (CA.analyzeCall(CS)) { |
967 // We were able to inline the indirect call! Subtract the cost from the | 1151 // We were able to inline the indirect call! Subtract the cost from the |
968 // threshold to get the bonus we want to apply, but don't go below zero. | 1152 // threshold to get the bonus we want to apply, but don't go below zero. |
969 Cost -= std::max(0, CA.getThreshold() - CA.getCost()); | 1153 Cost -= std::max(0, CA.getThreshold() - CA.getCost()); |
970 } | 1154 } |
987 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) || | 1171 return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) || |
988 dyn_cast_or_null<ConstantInt>( | 1172 dyn_cast_or_null<ConstantInt>( |
989 SimplifiedValues.lookup(BI.getCondition())); | 1173 SimplifiedValues.lookup(BI.getCondition())); |
990 } | 1174 } |
991 | 1175 |
1176 bool CallAnalyzer::visitSelectInst(SelectInst &SI) { | |
1177 bool CheckSROA = SI.getType()->isPointerTy(); | |
1178 Value *TrueVal = SI.getTrueValue(); | |
1179 Value *FalseVal = SI.getFalseValue(); | |
1180 | |
1181 Constant *TrueC = dyn_cast<Constant>(TrueVal); | |
1182 if (!TrueC) | |
1183 TrueC = SimplifiedValues.lookup(TrueVal); | |
1184 Constant *FalseC = dyn_cast<Constant>(FalseVal); | |
1185 if (!FalseC) | |
1186 FalseC = SimplifiedValues.lookup(FalseVal); | |
1187 Constant *CondC = | |
1188 dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition())); | |
1189 | |
1190 if (!CondC) { | |
1191 // Select C, X, X => X | |
1192 if (TrueC == FalseC && TrueC) { | |
1193 SimplifiedValues[&SI] = TrueC; | |
1194 return true; | |
1195 } | |
1196 | |
1197 if (!CheckSROA) | |
1198 return Base::visitSelectInst(SI); | |
1199 | |
1200 std::pair<Value *, APInt> TrueBaseAndOffset = | |
1201 ConstantOffsetPtrs.lookup(TrueVal); | |
1202 std::pair<Value *, APInt> FalseBaseAndOffset = | |
1203 ConstantOffsetPtrs.lookup(FalseVal); | |
1204 if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { | |
1205 ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; | |
1206 | |
1207 Value *SROAArg; | |
1208 DenseMap<Value *, int>::iterator CostIt; | |
1209 if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt)) | |
1210 SROAArgValues[&SI] = SROAArg; | |
1211 return true; | |
1212 } | |
1213 | |
1214 return Base::visitSelectInst(SI); | |
1215 } | |
1216 | |
1217 // Select condition is a constant. | |
1218 Value *SelectedV = CondC->isAllOnesValue() | |
1219 ? TrueVal | |
1220 : (CondC->isNullValue()) ? FalseVal : nullptr; | |
1221 if (!SelectedV) { | |
1222 // Condition is a vector constant that is not all 1s or all 0s. If all | |
1223 // operands are constants, ConstantExpr::getSelect() can handle the cases | |
1224 // such as select vectors. | |
1225 if (TrueC && FalseC) { | |
1226 if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) { | |
1227 SimplifiedValues[&SI] = C; | |
1228 return true; | |
1229 } | |
1230 } | |
1231 return Base::visitSelectInst(SI); | |
1232 } | |
1233 | |
1234 // Condition is either all 1s or all 0s. SI can be simplified. | |
1235 if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) { | |
1236 SimplifiedValues[&SI] = SelectedC; | |
1237 return true; | |
1238 } | |
1239 | |
1240 if (!CheckSROA) | |
1241 return true; | |
1242 | |
1243 std::pair<Value *, APInt> BaseAndOffset = | |
1244 ConstantOffsetPtrs.lookup(SelectedV); | |
1245 if (BaseAndOffset.first) { | |
1246 ConstantOffsetPtrs[&SI] = BaseAndOffset; | |
1247 | |
1248 Value *SROAArg; | |
1249 DenseMap<Value *, int>::iterator CostIt; | |
1250 if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt)) | |
1251 SROAArgValues[&SI] = SROAArg; | |
1252 } | |
1253 | |
1254 return true; | |
1255 } | |
1256 | |
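The cases the new select handler distinguishes, summarized as a paraphrase of the code above:

```cpp
//   select C, X, X       ==> X    (condition may stay unknown)
//   select true,  T, F   ==> T    (scalar or all-ones vector condition)
//   select false, T, F   ==> F    (scalar or all-zeros vector condition)
//   mixed vector cond    ==> ConstantExpr::getSelect(CondC, TrueC, FalseC)
//                            when all three operands are constant
// Base/offset pairs and SROA candidacy are forwarded from the operand
// that ends up selected.
```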
992 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { | 1257 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { |
993 // We model unconditional switches as free, see the comments on handling | 1258 // We model unconditional switches as free, see the comments on handling |
994 // branches. | 1259 // branches. |
995 if (isa<ConstantInt>(SI.getCondition())) | 1260 if (isa<ConstantInt>(SI.getCondition())) |
996 return true; | 1261 return true; |
997 if (Value *V = SimplifiedValues.lookup(SI.getCondition())) | 1262 if (Value *V = SimplifiedValues.lookup(SI.getCondition())) |
998 if (isa<ConstantInt>(V)) | 1263 if (isa<ConstantInt>(V)) |
999 return true; | 1264 return true; |
1000 | 1265 |
1001 // Otherwise, we need to accumulate a cost proportional to the number of | 1266 // Assume the most general case where the switch is lowered into |
1002 // distinct successor blocks. This fan-out in the CFG cannot be represented | 1267 // either a jump table, bit test, or a balanced binary tree consisting of |
1003 // for free even if we can represent the core switch as a jumptable that | 1268 // case clusters without merging adjacent clusters with the same |
1004 // takes a single instruction. | 1269 // destination. We do not consider switches that are lowered with a mix |
1270 // of jump table/bit test/binary search tree. The cost of the switch is | |
1271 // proportional to the size of the tree or the size of the jump table range. | |
1005 // | 1272 // |
1006 // NB: We convert large switches which are just used to initialize large phi | 1273 // NB: We convert large switches which are just used to initialize large phi |
1007 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent | 1274 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent |
1008 // inlining those. It will prevent inlining in cases where the optimization | 1275 // inlining those. It will prevent inlining in cases where the optimization |
1009 // does not (yet) fire. | 1276 // does not (yet) fire. |
1010 SmallPtrSet<BasicBlock *, 8> SuccessorBlocks; | 1277 |
1011 SuccessorBlocks.insert(SI.getDefaultDest()); | 1278 // Maximum valid value Cost may be raised to in this function. |
1012 for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I) | 1279 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; |
1013 SuccessorBlocks.insert(I.getCaseSuccessor()); | 1280 |
1014 // Add cost corresponding to the number of distinct destinations. The first | 1281 // Exit early for a large switch, assuming one case needs at least one |
1015 // we model as free because of fallthrough. | 1282 // instruction. |
1016 Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost; | 1283 // FIXME: This is not true for a bit test, but ignore such cases for now to |
1284 // save compile-time. | |
1285 int64_t CostLowerBound = | |
1286 std::min((int64_t)CostUpperBound, | |
1287 (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost); | |
1288 | |
1289 if (CostLowerBound > Threshold && !ComputeFullInlineCost) { | |
1290 Cost = CostLowerBound; | |
1291 return false; | |
1292 } | |
1293 | |
1294 unsigned JumpTableSize = 0; | |
1295 unsigned NumCaseCluster = | |
1296 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize); | |
1297 | |
1298 // If suitable for a jump table, consider the cost for the table size and | |
1299 // branch to destination. | |
1300 if (JumpTableSize) { | |
1301 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + | |
1302 4 * InlineConstants::InstrCost; | |
1303 | |
1304 Cost = std::min((int64_t)CostUpperBound, JTCost + Cost); | |
1305 return false; | |
1306 } | |
1307 | |
1308 // Considering forming a binary search tree, we should find the number of | |
1309 // nodes, which is the same as the number of comparisons when lowered. For a | |
1310 // given number of clusters, n, we can define a recursive function, f(n), to | |
1311 // find the number of nodes in the tree. The recursion is: | |
1312 // f(n) = 1 + f(n/2) + f(n - n/2), when n > 3, | |
1313 // and f(n) = n, when n <= 3. | |
1314 // This leads to a binary tree where each leaf is either f(2) or f(3) when | |
1315 // n > 3. So the number of comparisons from leaves should be n, while the | |
1316 // number of comparisons from non-leaf nodes should be: | |
1317 // 2^(log2(n) - 1) - 1 | |
1318 // = 2^log2(n) * 2^-1 - 1 | |
1319 // = n / 2 - 1. | |
1320 // Considering comparisons from leaf and non-leaf nodes, we can estimate the | |
1321 // number of comparisons in a simple closed form: | |
1322 // n + n / 2 - 1 = n * 3 / 2 - 1 | |
1323 if (NumCaseCluster <= 3) { | |
1324 // Suppose a comparison includes one compare and one conditional branch. | |
1325 Cost += NumCaseCluster * 2 * InlineConstants::InstrCost; | |
1326 return false; | |
1327 } | |
1328 | |
1329 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; | |
1330 int64_t SwitchCost = | |
1331 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; | |
1332 | |
1333 Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost); | |
1017 return false; | 1334 return false; |
1018 } | 1335 } |
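Putting the pieces together, and assuming InlineConstants::InstrCost is 5 (its value in this release), a 100-case switch analyzed with Threshold 225 and zero accumulated Cost hits the early exit: the lower bound 100 * 5 = 500 already exceeds the threshold unless ComputeFullInlineCost is set. A standalone sketch of the whole model (illustrative only, not the LLVM API; the running Cost term is omitted):

#include <algorithm>
#include <climits>
#include <cstdint>

constexpr int InstrCost = 5; // assumed value of InlineConstants::InstrCost

int64_t switchCost(unsigned JumpTableSize, unsigned NumCaseCluster) {
  int64_t UpperBound = INT_MAX - InstrCost - 1;
  if (JumpTableSize)           // table entries plus ~4 dispatch instructions
    return std::min(UpperBound,
                    (int64_t)JumpTableSize * InstrCost + 4 * InstrCost);
  if (NumCaseCluster <= 3)     // one compare and one branch per cluster
    return (int64_t)NumCaseCluster * 2 * InstrCost;
  int64_t Compares = 3 * (int64_t)NumCaseCluster / 2 - 1; // n * 3 / 2 - 1
  return std::min(UpperBound, Compares * 2 * InstrCost);
}
// e.g. switchCost(0, 10) == (3 * 10 / 2 - 1) * 2 * 5 == 140.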
1019 | 1336 |
1020 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { | 1337 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { |
1021 // We never want to inline functions that contain an indirectbr. This is | 1338 // We never want to inline functions that contain an indirectbr. This is |
1099 | 1416 |
1100 // If the instruction is floating point, and the target says this operation | 1417 // If the instruction is floating point, and the target says this operation |
1101 // is expensive or the function has the "use-soft-float" attribute, this may | 1418 // is expensive or the function has the "use-soft-float" attribute, this may |
1102 // eventually become a library call. Treat the cost as such. | 1419 // eventually become a library call. Treat the cost as such. |
1103 if (I->getType()->isFloatingPointTy()) { | 1420 if (I->getType()->isFloatingPointTy()) { |
1104 bool hasSoftFloatAttr = false; | |
1105 | |
1106 // If the function has the "use-soft-float" attribute, mark it as | 1421 // If the function has the "use-soft-float" attribute, mark it as |
1107 // expensive. | 1422 // expensive. |
1108 if (F.hasFnAttribute("use-soft-float")) { | |
1109 Attribute Attr = F.getFnAttribute("use-soft-float"); | |
1110 StringRef Val = Attr.getValueAsString(); | |
1111 if (Val == "true") | |
1112 hasSoftFloatAttr = true; | |
1113 } | |
1114 | |
1115 if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive || | 1423 if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive || |
1116 hasSoftFloatAttr) | 1424 (F.getFnAttribute("use-soft-float").getValueAsString() == "true")) |
1117 Cost += InlineConstants::CallPenalty; | 1425 Cost += InlineConstants::CallPenalty; |
1118 } | 1426 } |
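The one-line replacement works because querying a missing string attribute yields an empty string, which can never compare equal to "true", so the separate hasFnAttribute() guard and local flag become redundant. A small stand-in illustrating the idiom (a hypothetical map-based lookup, not LLVM's Attribute class):

#include <map>
#include <string>

// Hypothetical stand-in for Function::getFnAttribute on string attributes:
// returns "" when the attribute is absent.
std::string getAttr(const std::map<std::string, std::string> &Attrs,
                    const std::string &Key) {
  auto It = Attrs.find(Key);
  return It == Attrs.end() ? std::string() : It->second;
}

bool useSoftFloat(const std::map<std::string, std::string> &Attrs) {
  // An absent attribute yields "" which is unequal to "true", so no
  // separate presence check is needed.
  return getAttr(Attrs, "use-soft-float") == "true";
}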
1119 | 1427 |
1120 // If the instruction simplified to a constant, there is no cost to this | 1428 // If the instruction simplified to a constant, there is no cost to this |
1121 // instruction. Visit the instructions using our InstVisitor to account for | 1429 // instruction. Visit the instructions using our InstVisitor to account for |
1125 if (Base::visit(&*I)) | 1433 if (Base::visit(&*I)) |
1126 ++NumInstructionsSimplified; | 1434 ++NumInstructionsSimplified; |
1127 else | 1435 else |
1128 Cost += InlineConstants::InstrCost; | 1436 Cost += InlineConstants::InstrCost; |
1129 | 1437 |
1438 using namespace ore; | |
1130 // If visiting this instruction detected an uninlinable pattern, abort. | 1439 // If visiting this instruction detected an uninlinable pattern, abort. |
1131 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca || | 1440 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca || |
1132 HasIndirectBr || HasFrameEscape) | 1441 HasIndirectBr || HasFrameEscape) { |
1442 if (ORE) | |
1443 ORE->emit([&]() { | |
1444 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", | |
1445 CandidateCS.getInstruction()) | |
1446 << NV("Callee", &F) | |
1447 << " has uninlinable pattern and cost is not fully computed"; | |
1448 }); | |
1133 return false; | 1449 return false; |
1450 } | |
1134 | 1451 |
1135 // If the caller is a recursive function then we don't want to inline | 1452 // If the caller is a recursive function then we don't want to inline |
1136 // functions which allocate a lot of stack space because it would increase | 1453 // functions which allocate a lot of stack space because it would increase |
1137 // the caller stack usage dramatically. | 1454 // the caller stack usage dramatically. |
1138 if (IsCallerRecursive && | 1455 if (IsCallerRecursive && |
1139 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) | 1456 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) { |
1457 if (ORE) | |
1458 ORE->emit([&]() { | |
1459 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", | |
1460 CandidateCS.getInstruction()) | |
1461 << NV("Callee", &F) | |
1462 << " is recursive and allocates too much stack space. Cost is " | |
1463 "not fully computed"; | |
1464 }); | |
1140 return false; | 1465 return false; |
1466 } | |
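Both bail-out paths now report why analysis stopped, and they hand ORE a callback instead of a ready-made remark, so the message is only constructed when a remark consumer is attached. A standalone sketch of that deferred pattern (illustrative names, not LLVM's OptimizationRemarkEmitter):

#include <functional>
#include <iostream>
#include <string>

struct RemarkEmitter {
  bool Enabled = false;
  // The callback runs only when remarks are enabled, so building the
  // message string costs nothing on the common, disabled path.
  void emit(const std::function<std::string()> &MakeMsg) {
    if (Enabled)
      std::cout << MakeMsg() << '\n';
  }
};

int main() {
  RemarkEmitter ORE;                 // disabled: the lambda never runs
  ORE.emit([] { return std::string("NeverInline: uninlinable pattern"); });
}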
1141 | 1467 |
1142 // Check if we've passed the maximum possible threshold so we don't spin in | 1468 // Check if we've passed the maximum possible threshold so we don't spin in |
1143 // huge basic blocks that will never inline. | 1469 // huge basic blocks that will never inline. |
1144 if (Cost > Threshold) | 1470 if (Cost >= Threshold && !ComputeFullInlineCost) |
1145 return false; | 1471 return false; |
1146 } | 1472 } |
1147 | 1473 |
1148 return true; | 1474 return true; |
1149 } | 1475 } |
1156 /// no constant offsets applied. | 1482 /// no constant offsets applied. |
1157 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { | 1483 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { |
1158 if (!V->getType()->isPointerTy()) | 1484 if (!V->getType()->isPointerTy()) |
1159 return nullptr; | 1485 return nullptr; |
1160 | 1486 |
1161 const DataLayout &DL = F.getParent()->getDataLayout(); | |
1162 unsigned IntPtrWidth = DL.getPointerSizeInBits(); | 1487 unsigned IntPtrWidth = DL.getPointerSizeInBits(); |
1163 APInt Offset = APInt::getNullValue(IntPtrWidth); | 1488 APInt Offset = APInt::getNullValue(IntPtrWidth); |
1164 | 1489 |
1165 // Even though we don't look through PHI nodes, we could be called on an | 1490 // Even though we don't look through PHI nodes, we could be called on an |
1166 // instruction in an unreachable block, which may be on a cycle. | 1491 // instruction in an unreachable block, which may be on a cycle. |
1211 assert(NumVectorInstructions == 0); | 1536 assert(NumVectorInstructions == 0); |
1212 | 1537 |
1213 // Update the threshold based on callsite properties | 1538 // Update the threshold based on callsite properties |
1214 updateThreshold(CS, F); | 1539 updateThreshold(CS, F); |
1215 | 1540 |
1216 FiftyPercentVectorBonus = 3 * Threshold / 2; | |
1217 TenPercentVectorBonus = 3 * Threshold / 4; | |
1218 const DataLayout &DL = F.getParent()->getDataLayout(); | |
1219 | |
1220 // Track whether the post-inlining function would have more than one basic | |
1221 // block. A single basic block is often intended for inlining. Balloon the | |
1222 // threshold by 50% until we pass the single-BB phase. | |
1223 bool SingleBB = true; | |
1224 int SingleBBBonus = Threshold / 2; | |
1225 | |
1226 // Speculatively apply all possible bonuses to Threshold. If cost exceeds | 1541 // Speculatively apply all possible bonuses to Threshold. If cost exceeds |
1227 // this Threshold any time, and cost cannot decrease, we can stop processing | 1542 // this Threshold any time, and cost cannot decrease, we can stop processing |
1228 // the rest of the function body. | 1543 // the rest of the function body. |
1229 Threshold += (SingleBBBonus + FiftyPercentVectorBonus); | 1544 Threshold += (SingleBBBonus + VectorBonus); |
1230 | 1545 |
1231 // Give out bonuses per argument, as the instructions setting them up will | 1546 // Give out bonuses for the callsite, as the instructions setting them up |
1232 // be gone after inlining. | 1547 // will be gone after inlining. |
1233 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { | 1548 Cost -= getCallsiteCost(CS, DL); |
1234 if (CS.isByValArgument(I)) { | |
1235 // We approximate the number of loads and stores needed by dividing the | |
1236 // size of the byval type by the target's pointer size. | |
1237 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType()); | |
1238 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType()); | |
1239 unsigned PointerSize = DL.getPointerSizeInBits(); | |
1240 // Ceiling division. | |
1241 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; | |
1242 | |
1243 // If it generates more than 8 stores it is likely to be expanded as an | |
1244 // inline memcpy so we take that as an upper bound. Otherwise we assume | |
1245 // one load and one store per word copied. | |
1246 // FIXME: The maxStoresPerMemcpy setting from the target should be used | |
1247 // here instead of a magic number of 8, but it's not available via | |
1248 // DataLayout. | |
1249 NumStores = std::min(NumStores, 8U); | |
1250 | |
1251 Cost -= 2 * NumStores * InlineConstants::InstrCost; | |
1252 } else { | |
1253 // For non-byval arguments subtract off one instruction per call | |
1254 // argument. | |
1255 Cost -= InlineConstants::InstrCost; | |
1256 } | |
1257 } | |
1258 // The call instruction also disappears after inlining. | |
1259 Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty; | |
1260 | |
1261 // If there is only one call of the function, and it has internal linkage, | |
1262 // the cost of inlining it drops dramatically. | |
1263 bool OnlyOneCallAndLocalLinkage = | |
1264 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); | |
1265 if (OnlyOneCallAndLocalLinkage) | |
1266 Cost -= InlineConstants::LastCallToStaticBonus; | |
1267 | 1549 |
1268 // If this function uses the coldcc calling convention, prefer not to inline | 1550 // If this function uses the coldcc calling convention, prefer not to inline |
1269 // it. | 1551 // it. |
1270 if (F.getCallingConv() == CallingConv::Cold) | 1552 if (F.getCallingConv() == CallingConv::Cold) |
1271 Cost += InlineConstants::ColdccPenalty; | 1553 Cost += InlineConstants::ColdccPenalty; |
1272 | 1554 |
1273 // Check if we're done. This can happen due to bonuses and penalties. | 1555 // Check if we're done. This can happen due to bonuses and penalties. |
1274 if (Cost > Threshold) | 1556 if (Cost >= Threshold && !ComputeFullInlineCost) |
1275 return false; | 1557 return false; |
1276 | 1558 |
1277 if (F.empty()) | 1559 if (F.empty()) |
1278 return true; | 1560 return true; |
1279 | 1561 |
1330 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, | 1612 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, |
1331 SmallPtrSet<BasicBlock *, 16>> | 1613 SmallPtrSet<BasicBlock *, 16>> |
1332 BBSetVector; | 1614 BBSetVector; |
1333 BBSetVector BBWorklist; | 1615 BBSetVector BBWorklist; |
1334 BBWorklist.insert(&F.getEntryBlock()); | 1616 BBWorklist.insert(&F.getEntryBlock()); |
1617 bool SingleBB = true; | |
1335 // Note that we *must not* cache the size, this loop grows the worklist. | 1618 // Note that we *must not* cache the size, this loop grows the worklist. |
1336 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { | 1619 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { |
1337 // Bail out the moment we cross the threshold. This means we'll under-count | 1620 // Bail out the moment we cross the threshold. This means we'll under-count |
1338 // the cost, but only when undercounting doesn't matter. | 1621 // the cost, but only when undercounting doesn't matter. |
1339 if (Cost > Threshold) | 1622 if (Cost >= Threshold && !ComputeFullInlineCost) |
1340 break; | 1623 break; |
1341 | 1624 |
1342 BasicBlock *BB = BBWorklist[Idx]; | 1625 BasicBlock *BB = BBWorklist[Idx]; |
1343 if (BB->empty()) | 1626 if (BB->empty()) |
1344 continue; | 1627 continue; |
1372 } | 1655 } |
1373 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { | 1656 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { |
1374 Value *Cond = SI->getCondition(); | 1657 Value *Cond = SI->getCondition(); |
1375 if (ConstantInt *SimpleCond = | 1658 if (ConstantInt *SimpleCond = |
1376 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { | 1659 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { |
1377 BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor()); | 1660 BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor()); |
1378 continue; | 1661 continue; |
1379 } | 1662 } |
1380 } | 1663 } |
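The '.'-to-'->' change above tracks an LLVM 5.0 API update: SwitchInst::findCaseValue now returns a case iterator that must be dereferenced to reach the case handle. A before/after sketch, assuming SI and SimpleCond as in the loop:

  // LLVM 4.x: findCaseValue returned a handle-like object directly.
  //   BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
  // LLVM 5.0: it returns an iterator; operator-> yields the case handle.
  //   BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());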
1381 | 1664 |
1382 // If we're unable to select a particular successor, just count all of | 1665 // If we're unable to select a particular successor, just count all of |
1394 Threshold -= SingleBBBonus; | 1677 Threshold -= SingleBBBonus; |
1395 SingleBB = false; | 1678 SingleBB = false; |
1396 } | 1679 } |
1397 } | 1680 } |
1398 | 1681 |
1682 bool OnlyOneCallAndLocalLinkage = | |
1683 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction(); | |
1399 // If this is a noduplicate call, we can still inline as long as | 1684 // If this is a noduplicate call, we can still inline as long as |
1400 // inlining this would cause the removal of the caller (so the instruction | 1685 // inlining this would cause the removal of the caller (so the instruction |
1401 // is not actually duplicated, just moved). | 1686 // is not actually duplicated, just moved). |
1402 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) | 1687 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) |
1403 return false; | 1688 return false; |
1404 | 1689 |
1405 // We applied the maximum possible vector bonus at the beginning. Now, | 1690 // We applied the maximum possible vector bonus at the beginning. Now, |
1406 // subtract the excess bonus, if any, from the Threshold before | 1691 // subtract the excess bonus, if any, from the Threshold before |
1407 // comparing against Cost. | 1692 // comparing against Cost. |
1408 if (NumVectorInstructions <= NumInstructions / 10) | 1693 if (NumVectorInstructions <= NumInstructions / 10) |
1409 Threshold -= FiftyPercentVectorBonus; | 1694 Threshold -= VectorBonus; |
1410 else if (NumVectorInstructions <= NumInstructions / 2) | 1695 else if (NumVectorInstructions <= NumInstructions / 2) |
1411 Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus); | 1696 Threshold -= VectorBonus / 2; |
1412 | 1697 |
1413 return Cost < std::max(1, Threshold); | 1698 return Cost < std::max(1, Threshold); |
1414 } | 1699 } |
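The bonus bookkeeping above is easier to see with numbers: the full VectorBonus is speculatively added to Threshold up front, then clawed back at the end if fewer than 10% of the instructions were vector ops, or halved back if fewer than 50% were. A sketch of just that final adjustment (VectorBonus is whatever updateThreshold granted; names and values here are illustrative):

// Returns the final threshold after removing any unearned vector bonus.
int finalThreshold(int Threshold, int VectorBonus, int NumInstructions,
                   int NumVectorInstructions) {
  if (NumVectorInstructions <= NumInstructions / 10)
    return Threshold - VectorBonus;      // barely vector: drop the full bonus
  if (NumVectorInstructions <= NumInstructions / 2)
    return Threshold - VectorBonus / 2;  // moderately vector: drop half
  return Threshold;                      // vector-heavy: keep the full bonus
}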
1415 | 1700 |
1416 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) | 1701 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
1431 DEBUG_PRINT_STAT(Threshold); | 1716 DEBUG_PRINT_STAT(Threshold); |
1432 #undef DEBUG_PRINT_STAT | 1717 #undef DEBUG_PRINT_STAT |
1433 } | 1718 } |
1434 #endif | 1719 #endif |
1435 | 1720 |
1436 /// \brief Test that two functions either have or have not the given attribute | |
1437 /// at the same time. | |
1438 template <typename AttrKind> | |
1439 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) { | |
1440 return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr); | |
1441 } | |
1442 | |
1443 /// \brief Test that there are no attribute conflicts between Caller and Callee | 1721 /// \brief Test that there are no attribute conflicts between Caller and Callee |
1444 /// that prevent inlining. | 1722 /// that prevent inlining. |
1445 static bool functionsHaveCompatibleAttributes(Function *Caller, | 1723 static bool functionsHaveCompatibleAttributes(Function *Caller, |
1446 Function *Callee, | 1724 Function *Callee, |
1447 TargetTransformInfo &TTI) { | 1725 TargetTransformInfo &TTI) { |
1448 return TTI.areInlineCompatible(Caller, Callee) && | 1726 return TTI.areInlineCompatible(Caller, Callee) && |
1449 AttributeFuncs::areInlineCompatible(*Caller, *Callee); | 1727 AttributeFuncs::areInlineCompatible(*Caller, *Callee); |
1450 } | 1728 } |
1451 | 1729 |
1730 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) { | |
1731 int Cost = 0; | |
1732 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { | |
1733 if (CS.isByValArgument(I)) { | |
1734 // We approximate the number of loads and stores needed by dividing the | |
1735 // size of the byval type by the target's pointer size. | |
1736 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType()); | |
1737 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType()); | |
1738 unsigned PointerSize = DL.getPointerSizeInBits(); | |
1739 // Ceiling division. | |
1740 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; | |
1741 | |
1742 // If it generates more than 8 stores it is likely to be expanded as an | |
1743 // inline memcpy so we take that as an upper bound. Otherwise we assume | |
1744 // one load and one store per word copied. | |
1745 // FIXME: The maxStoresPerMemcpy setting from the target should be used | |
1746 // here instead of a magic number of 8, but it's not available via | |
1747 // DataLayout. | |
1748 NumStores = std::min(NumStores, 8U); | |
1749 | |
1750 Cost += 2 * NumStores * InlineConstants::InstrCost; | |
1751 } else { | |
1752 // For non-byval arguments add one instruction per call | |
1753 // argument. | |
1754 Cost += InlineConstants::InstrCost; | |
1755 } | |
1756 } | |
1757 // The call instruction also disappears after inlining. | |
1758 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty; | |
1759 return Cost; | |
1760 } | |
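As a worked instance of the byval estimate, a 64-byte byval struct on a 64-bit target needs ceil(512 / 64) = 8 word copies, exactly the memcpy cap, so the callsite is credited 2 * 8 * InstrCost = 80 units (assuming InstrCost is 5). A standalone sketch:

#include <algorithm>

constexpr unsigned InstrCost = 5;    // assumed InlineConstants::InstrCost
constexpr unsigned PointerBits = 64; // assumed 64-bit target

unsigned byvalArgCost(unsigned TypeSizeInBits) {
  // Ceiling division: one load and one store per pointer-sized word.
  unsigned NumStores = (TypeSizeInBits + PointerBits - 1) / PointerBits;
  NumStores = std::min(NumStores, 8u); // inline-memcpy expansion bound
  return 2 * NumStores * InstrCost;
}
// byvalArgCost(512) == 80; the caller subtracts this total from Cost.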
1761 | |
1452 InlineCost llvm::getInlineCost( | 1762 InlineCost llvm::getInlineCost( |
1453 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, | 1763 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, |
1454 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, | 1764 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, |
1455 ProfileSummaryInfo *PSI) { | 1765 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, |
1766 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { | |
1456 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, | 1767 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, |
1457 GetAssumptionCache, PSI); | 1768 GetAssumptionCache, GetBFI, PSI, ORE); |
1458 } | 1769 } |
1459 | 1770 |
1460 InlineCost llvm::getInlineCost( | 1771 InlineCost llvm::getInlineCost( |
1461 CallSite CS, Function *Callee, const InlineParams &Params, | 1772 CallSite CS, Function *Callee, const InlineParams &Params, |
1462 TargetTransformInfo &CalleeTTI, | 1773 TargetTransformInfo &CalleeTTI, |
1463 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, | 1774 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, |
1464 ProfileSummaryInfo *PSI) { | 1775 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI, |
1776 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) { | |
1465 | 1777 |
1466 // Cannot inline indirect calls. | 1778 // Cannot inline indirect calls. |
1467 if (!Callee) | 1779 if (!Callee) |
1468 return llvm::InlineCost::getNever(); | 1780 return llvm::InlineCost::getNever(); |
1469 | 1781 |
1475 return llvm::InlineCost::getNever(); | 1787 return llvm::InlineCost::getNever(); |
1476 } | 1788 } |
1477 | 1789 |
1478 // Never inline functions with conflicting attributes (unless callee has | 1790 // Never inline functions with conflicting attributes (unless callee has |
1479 // always-inline attribute). | 1791 // always-inline attribute). |
1480 if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI)) | 1792 Function *Caller = CS.getCaller(); |
1793 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI)) | |
1481 return llvm::InlineCost::getNever(); | 1794 return llvm::InlineCost::getNever(); |
1482 | 1795 |
1483 // Don't inline this call if the caller has the optnone attribute. | 1796 // Don't inline this call if the caller has the optnone attribute. |
1484 if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone)) | 1797 if (Caller->hasFnAttribute(Attribute::OptimizeNone)) |
1485 return llvm::InlineCost::getNever(); | 1798 return llvm::InlineCost::getNever(); |
1486 | 1799 |
1487 // Don't inline functions which can be interposed at link-time. Don't inline | 1800 // Don't inline functions which can be interposed at link-time. Don't inline |
1488 // functions marked noinline or call sites marked noinline. | 1801 // functions marked noinline or call sites marked noinline. |
1489 // Note: inlining non-exact non-interposable fucntions is fine, since we know | 1802 // Note: inlining non-exact non-interposable functions is fine, since we know |
1490 // we have *a* correct implementation of the source level function. | 1803 // we have *a* correct implementation of the source level function. |
1491 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) || | 1804 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) || |
1492 CS.isNoInline()) | 1805 CS.isNoInline()) |
1493 return llvm::InlineCost::getNever(); | 1806 return llvm::InlineCost::getNever(); |
1494 | 1807 |
1495 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() | 1808 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() |
1496 << "...\n"); | 1809 << "... (caller:" << Caller->getName() << ")\n"); |
1497 | 1810 |
1498 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params); | 1811 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS, |
1812 Params); | |
1499 bool ShouldInline = CA.analyzeCall(CS); | 1813 bool ShouldInline = CA.analyzeCall(CS); |
1500 | 1814 |
1501 DEBUG(CA.dump()); | 1815 DEBUG(CA.dump()); |
1502 | 1816 |
1503 // Check if there was a reason to force inlining or no inlining. | 1817 // Check if there was a reason to force inlining or no inlining. |
1566 Params.HintThreshold = HintThreshold; | 1880 Params.HintThreshold = HintThreshold; |
1567 | 1881 |
1568 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. | 1882 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. |
1569 Params.HotCallSiteThreshold = HotCallSiteThreshold; | 1883 Params.HotCallSiteThreshold = HotCallSiteThreshold; |
1570 | 1884 |
1571 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the | 1885 // If the -locally-hot-callsite-threshold is explicitly specified, use it to |
1886 // populate LocallyHotCallSiteThreshold. Later, we populate | |
1887 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if | |
1888 // we know that optimization level is O3 (in the getInlineParams variant that | |
1889 // takes the opt and size levels). | |
1890 // FIXME: Remove this check (and make the assignment unconditional) after | |
1891 // addressing size regression issues at O2. | |
1892 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0) | |
1893 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; | |
1894 | |
1895 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold. | |
1896 Params.ColdCallSiteThreshold = ColdCallSiteThreshold; | |
1897 | |
1572 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the | 1898 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the |
1573 // -inlinehint-threshold commandline option is not explicitly given. If that | 1899 // -inlinehint-threshold commandline option is not explicitly given. If that |
1574 // option is present, then its value applies even for callees with size and | 1900 // option is present, then its value applies even for callees with size and |
1575 // minsize attributes. | 1901 // minsize attributes. |
1576 // If the -inline-threshold is not specified, set the ColdThreshold from the | 1902 // If the -inline-threshold is not specified, set the ColdThreshold from the |
1603 return InlineConstants::OptMinSizeThreshold; | 1929 return InlineConstants::OptMinSizeThreshold; |
1604 return InlineThreshold; | 1930 return InlineThreshold; |
1605 } | 1931 } |
1606 | 1932 |
1607 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { | 1933 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { |
1608 return getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); | 1934 auto Params = |
1609 } | 1935 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); |
1936 // At O3, use the value of -locally-hot-callsite-threshold option to populate | |
1937 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only | |
1938 // when it is specified explicitly. | |
1939 if (OptLevel > 2) | |
1940 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold; | |
1941 return Params; | |
1942 } |
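A short usage sketch of the updated entry point (assumes llvm/Analysis/InlineCost.h is included and LLVM is linked): below O3 the locally hot threshold stays unset unless -locally-hot-callsite-threshold was passed, while at O3 it is always populated from the flag's value.

#include "llvm/Analysis/InlineCost.h"
using namespace llvm;

void configure() {
  InlineParams AtO2 = getInlineParams(2, 0); // LocallyHotCallSiteThreshold
                                             // set only if the flag was given
  InlineParams AtO3 = getInlineParams(3, 0); // always populated at O3
  (void)AtO2;
  (void)AtO3;
}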