comparison lib/Analysis/InlineCost.cpp @ 121:803732b1fca8

LLVM 5.0
author kono
date Fri, 27 Oct 2017 17:07:41 +0900
parents 1172e4bd9c6f
children 3a76565eade5
comparison
120:1172e4bd9c6f 121:803732b1fca8
@@ -16 +16 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -46 +47 @@
 
 static cl::opt<int> HintThreshold(
     "inlinehint-threshold", cl::Hidden, cl::init(325),
     cl::desc("Threshold for inlining functions with inline hint"));
 
+static cl::opt<int>
+    ColdCallSiteThreshold("inline-cold-callsite-threshold", cl::Hidden,
+                          cl::init(45),
+                          cl::desc("Threshold for inlining cold callsites"));
+
 // We introduce this threshold to help performance of instrumentation based
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
 static cl::opt<int> ColdThreshold(
-    "inlinecold-threshold", cl::Hidden, cl::init(225),
+    "inlinecold-threshold", cl::Hidden, cl::init(45),
     cl::desc("Threshold for inlining functions with cold attribute"));
 
 static cl::opt<int>
     HotCallSiteThreshold("hot-callsite-threshold", cl::Hidden, cl::init(3000),
                          cl::ZeroOrMore,
                          cl::desc("Threshold for hot callsites"));
 
+static cl::opt<int> LocallyHotCallSiteThreshold(
+    "locally-hot-callsite-threshold", cl::Hidden, cl::init(525), cl::ZeroOrMore,
+    cl::desc("Threshold for locally hot callsites"));
+
+static cl::opt<int> ColdCallSiteRelFreq(
+    "cold-callsite-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+    cl::desc("Maximum block frequency, expressed as a percentage of caller's "
+             "entry frequency, for a callsite to be cold in the absence of "
+             "profile information."));
+
+static cl::opt<int> HotCallSiteRelFreq(
+    "hot-callsite-rel-freq", cl::Hidden, cl::init(60), cl::ZeroOrMore,
+    cl::desc("Minimum block frequency, expressed as a multiple of caller's "
+             "entry frequency, for a callsite to be hot in the absence of "
+             "profile information."));
+
+static cl::opt<bool> OptComputeFullInlineCost(
+    "inline-cost-full", cl::Hidden, cl::init(false),
+    cl::desc("Compute the full inline cost of a call site even when the cost "
+             "exceeds the threshold."));
+
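All of these knobs are hidden cl::opt flags, so they can be exercised from the
driver when experimenting with the new thresholds; an illustrative invocation
(hypothetical values, flag names as defined above):

    clang -O2 -mllvm -inline-cold-callsite-threshold=30 \
          -mllvm -locally-hot-callsite-threshold=600 test.c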
@@ -63 +90 @@
 namespace {
 
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   typedef InstVisitor<CallAnalyzer, bool> Base;
   friend class InstVisitor<CallAnalyzer, bool>;
@@ -70 +97 @@
   const TargetTransformInfo &TTI;
 
   /// Getter for the cache of @llvm.assume intrinsics.
   std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
 
+  /// Getter for BlockFrequencyInfo
+  Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
+
   /// Profile summary information.
   ProfileSummaryInfo *PSI;
 
   /// The called function.
   Function &F;
+
+  // Cache the DataLayout since we use it a lot.
+  const DataLayout &DL;
+
+  /// The OptimizationRemarkEmitter available for this compilation.
+  OptimizationRemarkEmitter *ORE;
 
   /// The candidate callsite being analyzed. Please do not use this to do
   /// analysis in the caller function; we want the inline cost query to be
   /// easily cacheable. Instead, use the cover function paramHasAttr.
   CallSite CandidateCS;
@@ -86 +122 @@
   /// Tunable parameters that control the analysis.
   const InlineParams &Params;
 
   int Threshold;
   int Cost;
+  bool ComputeFullInlineCost;
 
   bool IsCallerRecursive;
   bool IsRecursiveCall;
   bool ExposesReturnsTwice;
   bool HasDynamicAlloca;
@@ -99 +136 @@
   bool HasFrameEscape;
 
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
   unsigned NumInstructions, NumVectorInstructions;
-  int FiftyPercentVectorBonus, TenPercentVectorBonus;
-  int VectorBonus;
+  int VectorBonus;
+  // Bonus to be applied when the callee has only one reachable basic block.
+  int SingleBBBonus;
 
   /// While we walk the potentially-inlined instructions, we build up and
   /// maintain a mapping of simplified values specific to this callsite. The
   /// idea is to propagate any special information we have about arguments to
   /// this call through the inlinable section of the function, and account for
@@ -131 +169 @@
                             DenseMap<Value *, int>::iterator &CostIt);
   void disableSROA(DenseMap<Value *, int>::iterator CostIt);
   void disableSROA(Value *V);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
-  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
+  bool isGEPFree(GetElementPtrInst &GEP);
+  bool canFoldInboundsGEP(GetElementPtrInst &I);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
+  template <typename Callable>
+  bool simplifyInstruction(Instruction &I, Callable Evaluate);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
 
   /// Return true if the given argument to the function being considered for
   /// inlining has the given attribute set either at the call site or the
   /// function declaration. Primarily used to inspect call site specific
@@ -155 +196 @@
   /// analysis.
   void updateThreshold(CallSite CS, Function &Callee);
 
   /// Return true if size growth is allowed when inlining the callee at CS.
   bool allowSizeGrowth(CallSite CS);
+
+  /// Return true if \p CS is a cold callsite.
+  bool isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI);
+
+  /// Return a higher threshold if \p CS is a hot callsite.
+  Optional<int> getHotCallSiteThreshold(CallSite CS,
+                                        BlockFrequencyInfo *CallerBFI);
 
   // Custom analysis routines.
   bool analyzeBlock(BasicBlock *BB, SmallPtrSetImpl<const Value *> &EphValues);
 
   // Disable several entry points to the visitor so we don't accidentally use
@@ -181 +229 @@
   bool visitPtrToInt(PtrToIntInst &I);
   bool visitIntToPtr(IntToPtrInst &I);
   bool visitCastInst(CastInst &I);
   bool visitUnaryInstruction(UnaryInstruction &I);
   bool visitCmpInst(CmpInst &I);
+  bool visitAnd(BinaryOperator &I);
+  bool visitOr(BinaryOperator &I);
   bool visitSub(BinaryOperator &I);
   bool visitBinaryOperator(BinaryOperator &I);
   bool visitLoad(LoadInst &I);
   bool visitStore(StoreInst &I);
   bool visitExtractValue(ExtractValueInst &I);
   bool visitInsertValue(InsertValueInst &I);
   bool visitCallSite(CallSite CS);
   bool visitReturnInst(ReturnInst &RI);
   bool visitBranchInst(BranchInst &BI);
+  bool visitSelectInst(SelectInst &SI);
   bool visitSwitchInst(SwitchInst &SI);
   bool visitIndirectBrInst(IndirectBrInst &IBI);
   bool visitResumeInst(ResumeInst &RI);
   bool visitCleanupReturnInst(CleanupReturnInst &RI);
   bool visitCatchReturnInst(CatchReturnInst &RI);
   bool visitUnreachableInst(UnreachableInst &I);
 
 public:
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
-               ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
-               const InlineParams &Params)
-      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), PSI(PSI), F(Callee),
+               Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
+               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
+               Function &Callee, CallSite CSArg, const InlineParams &Params)
+      : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
+        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
-        Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
+        Cost(0), ComputeFullInlineCost(OptComputeFullInlineCost ||
+                                       Params.ComputeFullInlineCost || ORE),
+        IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
         ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
         HasFrameEscape(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+        NumVectorInstructions(0), VectorBonus(0), SingleBBBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
   int getThreshold() { return Threshold; }
   int getCost() { return Cost; }
@@ -284 +339 @@
                                       int InstructionCost) {
   CostIt->second += InstructionCost;
   SROACostSavings += InstructionCost;
 }
 
-/// \brief Check whether a GEP's indices are all constant.
-///
-/// Respects any simplified values known during the analysis of this callsite.
-bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
-  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
-    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
-      return false;
-
-  return true;
-}
-
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
 /// simplified values known during the analysis of this callsite.
 bool CallAnalyzer::accumulateGEPOffset(GEPOperator &GEP, APInt &Offset) {
-  const DataLayout &DL = F.getParent()->getDataLayout();
   unsigned IntPtrWidth = DL.getPointerSizeInBits();
   assert(IntPtrWidth == Offset.getBitWidth());
 
   for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
        GTI != GTE; ++GTI) {
@@ -316 +359 @@
       return false;
     if (OpC->isZero())
       continue;
 
     // Handle a struct index, which adds its field offset to the pointer.
-    if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+    if (StructType *STy = GTI.getStructTypeOrNull()) {
       unsigned ElementIdx = OpC->getZExtValue();
       const StructLayout *SL = DL.getStructLayout(STy);
       Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx));
       continue;
     }
 
     APInt TypeSize(IntPtrWidth, DL.getTypeAllocSize(GTI.getIndexedType()));
     Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize;
   }
   return true;
+}
+
+/// \brief Use TTI to check whether a GEP is free.
+///
+/// Respects any simplified values known during the analysis of this callsite.
+bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
+  SmallVector<Value *, 4> Operands;
+  Operands.push_back(GEP.getOperand(0));
+  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+    if (Constant *SimpleOp = SimplifiedValues.lookup(*I))
+      Operands.push_back(SimpleOp);
+    else
+      Operands.push_back(*I);
+  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
 }
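To make the accumulation loop concrete, a standalone sketch (hypothetical
64-bit layout, not LLVM code) of what it computes for a GEP with constant
indices (0, 1) into a struct of an i32 followed by an i64:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t StructAllocSize = 16;   // DL.getTypeAllocSize of {i32, i64}
  const uint64_t FieldOffset[] = {0, 8}; // SL->getElementOffset(ElementIdx)
  int64_t Offset = 0;
  Offset += 0 * StructAllocSize; // array index 0 scales by the alloc size
  Offset += FieldOffset[1];      // struct index 1 adds its field offset
  assert(Offset == 8);           // ConstantOffsetPtrs[&GEP] = {Base, 8}
  return 0;
}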
@@ -333 +390 @@
 
 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
   // Check whether inlining will turn a dynamic alloca into a static
   // alloca and handle that case.
   if (I.isArrayAllocation()) {
     Constant *Size = SimplifiedValues.lookup(I.getArraySize());
     if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
-      const DataLayout &DL = F.getParent()->getDataLayout();
       Type *Ty = I.getAllocatedType();
       AllocatedSize = SaturatingMultiplyAdd(
           AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty), AllocatedSize);
       return Base::visitAlloca(I);
     }
   }
 
   // Accumulate the allocated size.
   if (I.isStaticAlloca()) {
-    const DataLayout &DL = F.getParent()->getDataLayout();
     Type *Ty = I.getAllocatedType();
     AllocatedSize = SaturatingAdd(DL.getTypeAllocSize(Ty), AllocatedSize);
   }
 
   // We will happily inline static alloca instructions.
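The size accounting above saturates rather than wraps; a minimal standalone
sketch (hypothetical sizes, using compiler builtins rather than LLVM's
MathExtras helpers):

#include <cstdint>
#include <limits>

static uint64_t saturatingMultiplyAdd(uint64_t A, uint64_t B, uint64_t C) {
  uint64_t Mul, Sum;
  if (__builtin_mul_overflow(A, B, &Mul) ||
      __builtin_add_overflow(Mul, C, &Sum))
    return std::numeric_limits<uint64_t>::max(); // clamp on overflow
  return Sum;
}

int main() {
  uint64_t AllocatedSize = 0;
  // A folded `alloca i32, i32 40` adds Count * ElementSize = 160 bytes.
  AllocatedSize = saturatingMultiplyAdd(40, 4, AllocatedSize);
  return AllocatedSize == 160 ? 0 : 1;
}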
@@ -375 +430 @@
 
   // Phi nodes are always zero-cost.
   return true;
 }
 
+/// \brief Check we can fold GEPs of constant-offset call site argument pointers.
+/// This requires target data and inbounds GEPs.
+///
+/// \return true if the specified GEP can be folded.
+bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) {
+  // Check if we have a base + offset for the pointer.
+  std::pair<Value *, APInt> BaseAndOffset =
+      ConstantOffsetPtrs.lookup(I.getPointerOperand());
+  if (!BaseAndOffset.first)
+    return false;
+
+  // Check if the offset of this GEP is constant, and if so accumulate it
+  // into Offset.
+  if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second))
+    return false;
+
+  // Add the result as a new mapping to Base + Offset.
+  ConstantOffsetPtrs[&I] = BaseAndOffset;
+
+  return true;
+}
+
 bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   Value *SROAArg;
   DenseMap<Value *, int>::iterator CostIt;
   bool SROACandidate =
       lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt);
 
-  // Try to fold GEPs of constant-offset call site argument pointers. This
-  // requires target data and inbounds GEPs.
-  if (I.isInBounds()) {
-    // Check if we have a base + offset for the pointer.
-    Value *Ptr = I.getPointerOperand();
-    std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Ptr);
-    if (BaseAndOffset.first) {
-      // Check if the offset of this GEP is constant, and if so accumulate it
-      // into Offset.
-      if (!accumulateGEPOffset(cast<GEPOperator>(I), BaseAndOffset.second)) {
-        // Non-constant GEPs aren't folded, and disable SROA.
-        if (SROACandidate)
-          disableSROA(CostIt);
-        return false;
-      }
-
-      // Add the result as a new mapping to Base + Offset.
-      ConstantOffsetPtrs[&I] = BaseAndOffset;
-
-      // Also handle SROA candidates here, we already know that the GEP is
-      // all-constant indexed.
-      if (SROACandidate)
-        SROAArgValues[&I] = SROAArg;
-
-      return true;
-    }
-  }
-
-  if (isGEPOffsetConstant(I)) {
+  // Lambda to check whether a GEP's indices are all constant.
+  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
+    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+        return false;
+    return true;
+  };
+
+  if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
     if (SROACandidate)
       SROAArgValues[&I] = SROAArg;
 
     // Constant GEPs are modeled as free.
     return true;
   }
 
   // Variable GEPs will require math and will disable SROA.
   if (SROACandidate)
     disableSROA(CostIt);
-  return false;
+  return isGEPFree(I);
 }
+
+/// Simplify \p I if its operands are constants and update SimplifiedValues.
+/// \p Evaluate is a callable specific to instruction type that evaluates the
+/// instruction when all the operands are constants.
+template <typename Callable>
+bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
+  SmallVector<Constant *, 2> COps;
+  for (Value *Op : I.operands()) {
+    Constant *COp = dyn_cast<Constant>(Op);
+    if (!COp)
+      COp = SimplifiedValues.lookup(Op);
+    if (!COp)
+      return false;
+    COps.push_back(COp);
+  }
+  auto *C = Evaluate(COps);
+  if (!C)
+    return false;
+  SimplifiedValues[&I] = C;
+  return true;
+}
 
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getBitCast(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offsets through casts
   std::pair<Value *, APInt> BaseAndOffset =
       ConstantOffsetPtrs.lookup(I.getOperand(0));
   // Casts don't change the offset, just wrap it up.
@@ -453 +527 @@
   return true;
 }
 
 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getPtrToInt(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offset pairs when converted to a plain integer provided the
   // integer is large enough to represent the pointer.
   unsigned IntegerSize = I.getType()->getScalarSizeInBits();
-  const DataLayout &DL = F.getParent()->getDataLayout();
   if (IntegerSize >= DL.getPointerSizeInBits()) {
     std::pair<Value *, APInt> BaseAndOffset =
         ConstantOffsetPtrs.lookup(I.getOperand(0));
     if (BaseAndOffset.first)
       ConstantOffsetPtrs[&I] = BaseAndOffset;
@@ -490 +559 @@
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
 }
 
 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   // Propagate constants through inttoptr.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getIntToPtr(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offset pairs when round-tripped through a pointer without
   // modifications provided the integer is not too large.
   Value *Op = I.getOperand(0);
   unsigned IntegerSize = Op->getType()->getScalarSizeInBits();
-  const DataLayout &DL = F.getParent()->getDataLayout();
   if (IntegerSize <= DL.getPointerSizeInBits()) {
     std::pair<Value *, APInt> BaseAndOffset = ConstantOffsetPtrs.lookup(Op);
     if (BaseAndOffset.first)
       ConstantOffsetPtrs[&I] = BaseAndOffset;
   }
@@ -521 +585 @@
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
 }
 
 bool CallAnalyzer::visitCastInst(CastInst &I) {
   // Propagate constants through casts.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
+      }))
+    return true;
 
   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
   disableSROA(I.getOperand(0));
 
   return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
 }
 
 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
   Value *Operand = I.getOperand(0);
-  Constant *COp = dyn_cast<Constant>(Operand);
-  if (!COp)
-    COp = SimplifiedValues.lookup(Operand);
-  if (COp) {
-    const DataLayout &DL = F.getParent()->getDataLayout();
-    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
-  }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantFoldInstOperands(&I, COps[0], DL);
+      }))
+    return true;
 
   // Disable any SROA on the argument to arbitrary unary operators.
   disableSROA(Operand);
 
   return false;
 }
 
 bool CallAnalyzer::paramHasAttr(Argument *A, Attribute::AttrKind Attr) {
-  unsigned ArgNo = A->getArgNo();
-  return CandidateCS.paramHasAttr(ArgNo + 1, Attr);
+  return CandidateCS.paramHasAttr(A->getArgNo(), Attr);
 }
 
 bool CallAnalyzer::isKnownNonNullInCallee(Value *V) {
   // Does the *call site* have the NonNull attribute set on an argument? We
   // use the attribute on the call site to memoize any analysis done in the
@@ -608 +661 @@
     return false;
 
   return true;
 }
 
+bool CallAnalyzer::isColdCallSite(CallSite CS, BlockFrequencyInfo *CallerBFI) {
+  // If global profile summary is available, then callsite's coldness is
+  // determined based on that.
+  if (PSI && PSI->hasProfileSummary())
+    return PSI->isColdCallSite(CS, CallerBFI);
+
+  // Otherwise we need BFI to be available.
+  if (!CallerBFI)
+    return false;
+
+  // Determine if the callsite is cold relative to caller's entry. We could
+  // potentially cache the computation of scaled entry frequency, but the added
+  // complexity is not worth it unless this scaling shows up high in the
+  // profiles.
+  const BranchProbability ColdProb(ColdCallSiteRelFreq, 100);
+  auto CallSiteBB = CS.getInstruction()->getParent();
+  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB);
+  auto CallerEntryFreq =
+      CallerBFI->getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+  return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
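With the default -cold-callsite-rel-freq=2, the comparison above marks a
callsite cold when its block frequency is under 2% of the caller's entry
frequency. A minimal sketch with plain integers standing in for
BlockFrequency/BranchProbability (hypothetical numbers):

#include <cassert>

int main() {
  const unsigned ColdRelFreqPercent = 2; // -cold-callsite-rel-freq default
  unsigned long long CallerEntryFreq = 1000, CallSiteFreq = 15;
  bool Cold = CallSiteFreq < CallerEntryFreq * ColdRelFreqPercent / 100;
  assert(Cold); // 15 < 20, so the cold-callsite threshold applies
  return 0;
}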
+
+Optional<int>
+CallAnalyzer::getHotCallSiteThreshold(CallSite CS,
+                                      BlockFrequencyInfo *CallerBFI) {
+
+  // If global profile summary is available, then callsite's hotness is
+  // determined based on that.
+  if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(CS, CallerBFI))
+    return Params.HotCallSiteThreshold;
+
+  // Otherwise we need BFI to be available and to have a locally hot callsite
+  // threshold.
+  if (!CallerBFI || !Params.LocallyHotCallSiteThreshold)
+    return None;
+
+  // Determine if the callsite is hot relative to caller's entry. We could
+  // potentially cache the computation of scaled entry frequency, but the added
+  // complexity is not worth it unless this scaling shows up high in the
+  // profiles.
+  auto CallSiteBB = CS.getInstruction()->getParent();
+  auto CallSiteFreq = CallerBFI->getBlockFreq(CallSiteBB).getFrequency();
+  auto CallerEntryFreq = CallerBFI->getEntryFreq();
+  if (CallSiteFreq >= CallerEntryFreq * HotCallSiteRelFreq)
+    return Params.LocallyHotCallSiteThreshold;
+
+  // Otherwise treat it normally.
+  return None;
+}
+
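The hot case mirrors it with the default -hot-callsite-rel-freq=60: a callsite
running at least 60x the caller's entry frequency gets the locally hot
threshold (525 by default) even without a profile summary. Same sketch
conventions as above:

#include <cassert>

int main() {
  const unsigned HotRelFreq = 60; // -hot-callsite-rel-freq default
  unsigned long long CallerEntryFreq = 10, CallSiteFreq = 700;
  bool LocallyHot = CallSiteFreq >= CallerEntryFreq * HotRelFreq;
  assert(LocallyHot); // 700 >= 600, so Threshold becomes 525
  return 0;
}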
@@ -613 +716 @@
 void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
   // If no size growth is allowed for this inlining, set Threshold to 0.
   if (!allowSizeGrowth(CS)) {
     Threshold = 0;
     return;
@@ -627 +730 @@
   // return max(A, B) if B is valid.
   auto MaxIfValid = [](int A, Optional<int> B) {
     return B ? std::max(A, B.getValue()) : A;
   };
 
+  // Various bonus percentages. These are multiplied by Threshold to get the
+  // bonus values.
+  // SingleBBBonus: This bonus is applied if the callee has a single reachable
+  // basic block at the given callsite context. This is speculatively applied
+  // and withdrawn if more than one basic block is seen.
+  //
+  // Vector bonuses: We want to more aggressively inline vector-dense kernels
+  // and apply this bonus based on the percentage of vector instructions. A
+  // bonus is applied if the vector instructions exceed 50% and half that amount
+  // is applied if it exceeds 10%. Note that these bonuses are somewhat
+  // arbitrary and evolved over time by accident as much as because they are
+  // principled bonuses.
+  // FIXME: It would be nice to base the bonus values on something more
+  // scientific.
+  //
+  // LastCallToStaticBonus: This large bonus is applied to ensure the inlining
+  // of the last call to a static function as inlining such functions is
+  // guaranteed to reduce code size.
+  //
+  // These bonus percentages may be set to 0 based on properties of the caller
+  // and the callsite.
+  int SingleBBBonusPercent = 50;
+  int VectorBonusPercent = 150;
+  int LastCallToStaticBonus = InlineConstants::LastCallToStaticBonus;
+
+  // Lambda to set all the above bonus and bonus percentages to 0.
+  auto DisallowAllBonuses = [&]() {
+    SingleBBBonusPercent = 0;
+    VectorBonusPercent = 0;
+    LastCallToStaticBonus = 0;
+  };
+
   // Use the OptMinSizeThreshold or OptSizeThreshold knob if they are available
   // and reduce the threshold if the caller has the necessary attribute.
-  if (Caller->optForMinSize())
+  if (Caller->optForMinSize()) {
     Threshold = MinIfValid(Threshold, Params.OptMinSizeThreshold);
-  else if (Caller->optForSize())
+    // For minsize, we want to disable the single BB bonus and the vector
+    // bonuses, but not the last-call-to-static bonus. Inlining the last call to
+    // a static function will, at the minimum, eliminate the parameter setup and
+    // call/return instructions.
+    SingleBBBonusPercent = 0;
+    VectorBonusPercent = 0;
+  } else if (Caller->optForSize())
     Threshold = MinIfValid(Threshold, Params.OptSizeThreshold);
 
-  bool HotCallsite = false;
-  uint64_t TotalWeight;
-  if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
-      PSI->isHotCount(TotalWeight)) {
-    HotCallsite = true;
-  }
-
-  // Listen to the inlinehint attribute or profile based hotness information
-  // when it would increase the threshold and the caller does not need to
-  // minimize its size.
-  bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
-                    PSI->isFunctionEntryHot(&Callee);
-  if (InlineHint && !Caller->optForMinSize())
-    Threshold = MaxIfValid(Threshold, Params.HintThreshold);
-
-  if (HotCallsite && !Caller->optForMinSize())
-    Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
-
-  bool ColdCallee = PSI->isFunctionEntryCold(&Callee);
-  // For cold callees, use the ColdThreshold knob if it is available and reduces
-  // the threshold.
-  if (ColdCallee)
-    Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+  // Adjust the threshold based on inlinehint attribute and profile based
+  // hotness information if the caller does not have MinSize attribute.
+  if (!Caller->optForMinSize()) {
+    if (Callee.hasFnAttribute(Attribute::InlineHint))
+      Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+
+    // FIXME: After switching to the new passmanager, simplify the logic below
+    // by checking only the callsite hotness/coldness as we will reliably
+    // have local profile information.
+    //
+    // Callsite hotness and coldness can be determined if sample profile is
+    // used (which adds hotness metadata to calls) or if caller's
+    // BlockFrequencyInfo is available.
+    BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+    auto HotCallSiteThreshold = getHotCallSiteThreshold(CS, CallerBFI);
+    if (!Caller->optForSize() && HotCallSiteThreshold) {
+      DEBUG(dbgs() << "Hot callsite.\n");
+      // FIXME: This should update the threshold only if it exceeds the
+      // current threshold, but AutoFDO + ThinLTO currently relies on this
+      // behavior to prevent inlining of hot callsites during ThinLTO
+      // compile phase.
+      Threshold = HotCallSiteThreshold.getValue();
+    } else if (isColdCallSite(CS, CallerBFI)) {
+      DEBUG(dbgs() << "Cold callsite.\n");
+      // Do not apply bonuses for a cold callsite including the
+      // LastCallToStatic bonus. While this bonus might result in code size
+      // reduction, it can cause the size of a non-cold caller to increase
+      // preventing it from being inlined.
+      DisallowAllBonuses();
+      Threshold = MinIfValid(Threshold, Params.ColdCallSiteThreshold);
+    } else if (PSI) {
+      // Use callee's global profile information only if we have no way of
+      // determining this via callsite information.
+      if (PSI->isFunctionEntryHot(&Callee)) {
+        DEBUG(dbgs() << "Hot callee.\n");
+        // If callsite hotness can not be determined, we may still know
+        // that the callee is hot and treat it as a weaker hint for threshold
+        // increase.
+        Threshold = MaxIfValid(Threshold, Params.HintThreshold);
+      } else if (PSI->isFunctionEntryCold(&Callee)) {
+        DEBUG(dbgs() << "Cold callee.\n");
+        // Do not apply bonuses for a cold callee including the
+        // LastCallToStatic bonus. While this bonus might result in code size
+        // reduction, it can cause the size of a non-cold caller to increase
+        // preventing it from being inlined.
+        DisallowAllBonuses();
+        Threshold = MinIfValid(Threshold, Params.ColdThreshold);
+      }
+    }
+  }
 
   // Finally, take the target-specific inlining threshold multiplier into
   // account.
   Threshold *= TTI.getInliningThresholdMultiplier();
+
+  SingleBBBonus = Threshold * SingleBBBonusPercent / 100;
+  VectorBonus = Threshold * VectorBonusPercent / 100;
+
+  bool OnlyOneCallAndLocalLinkage =
+      F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
+  // If there is only one call of the function, and it has internal linkage,
+  // the cost of inlining it drops dramatically. It may seem odd to update
+  // Cost in updateThreshold, but the bonus depends on the logic in this method.
+  if (OnlyOneCallAndLocalLinkage)
+    Cost -= LastCallToStaticBonus;
 }
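Worked numbers for the bonus arithmetic (hypothetical, with the default
percentages and a 225 threshold): the speculative single-BB bonus is
225 * 50 / 100 = 112 and the vector bonus is 225 * 150 / 100 = 337;
DisallowAllBonuses() zeroes both percentages and the last-call-to-static
bonus for cold sites.

#include <cassert>

int main() {
  int Threshold = 225;                      // hypothetical starting threshold
  int SingleBBBonus = Threshold * 50 / 100; // speculative, may be withdrawn
  int VectorBonus = Threshold * 150 / 100;
  assert(SingleBBBonus == 112 && VectorBonus == 337);
  return 0;
}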
@@ -667 +846 @@
 
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   // First try to handle simplified comparisons.
-  if (!isa<Constant>(LHS))
-    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
-      LHS = SimpleLHS;
-  if (!isa<Constant>(RHS))
-    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
-      RHS = SimpleRHS;
-  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
-    if (Constant *CRHS = dyn_cast<Constant>(RHS))
-      if (Constant *C =
-              ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
-        SimplifiedValues[&I] = C;
-        return true;
-      }
-  }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
+      }))
+    return true;
 
   if (I.getOpcode() == Instruction::FCmp)
     return false;
 
   // Otherwise look for a comparison between constant offset pointers with
@@ -726 +895 @@
 
     disableSROA(CostIt);
   }
 
   return false;
 }
+
+bool CallAnalyzer::visitOr(BinaryOperator &I) {
+  // This is necessary because the generic simplify instruction only works if
+  // both operands are constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+            SimplifiedValues.lookup(I.getOperand(i))))
+      if (C->isAllOnesValue()) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+  }
+  return Base::visitOr(I);
+}
+
+bool CallAnalyzer::visitAnd(BinaryOperator &I) {
+  // This is necessary because the generic simplify instruction only works if
+  // both operands are constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    if (ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+            SimplifiedValues.lookup(I.getOperand(i))))
+      if (C->isZero()) {
+        SimplifiedValues[&I] = C;
+        return true;
+      }
+  }
+  return Base::visitAnd(I);
+}
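These two visitors exist because an absorbing element fixes the result even
when the other operand is unknown, which the generic simplifyInstruction
(all operands constant) cannot exploit. A tiny self-check of the identities
involved:

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t X : {0, 7, -42}) { // X plays the unknown operand
    assert((X | -1) == -1);       // or  %x, -1 folds to -1
    assert((X & 0) == 0);         // and %x,  0 folds to  0
  }
  return 0;
}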
@@ -732 +929 @@
 
 bool CallAnalyzer::visitSub(BinaryOperator &I) {
   // Try to handle a special case: we can fold computing the difference of two
   // constant-related pointers.
@@ -757 +954 @@
   return Base::visitSub(I);
 }
 
 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  if (!isa<Constant>(LHS))
-    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
-      LHS = SimpleLHS;
-  if (!isa<Constant>(RHS))
-    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
-      RHS = SimpleRHS;
-  Value *SimpleV = nullptr;
-  if (auto FI = dyn_cast<FPMathOperator>(&I))
-    SimpleV =
-        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
-  else
-    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
-
-  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
-    SimplifiedValues[&I] = C;
+  auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
+    Value *SimpleV = nullptr;
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
+                                FI->getFastMathFlags(), DL);
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
+    return dyn_cast_or_null<Constant>(SimpleV);
+  };
+
+  if (simplifyInstruction(I, Evaluate))
     return true;
-  }
 
   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
   disableSROA(LHS);
   disableSROA(RHS);
 
@@ -815 +1006 @@
   return false;
 }
 
 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
   // Constant folding for extract value is trivial.
-  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
-  if (!C)
-    C = SimplifiedValues.lookup(I.getAggregateOperand());
-  if (C) {
-    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
+      }))
     return true;
-  }
 
   // SROA can look through these but give them a cost.
   return false;
 }
 
 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
   // Constant folding for insert value is trivial.
-  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
-  if (!AggC)
-    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
-  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
-  if (!InsertedC)
-    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
-  if (AggC && InsertedC) {
-    SimplifiedValues[&I] =
-        ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
+                                            /*InsertedValueOperand*/ COps[1],
+                                            I.getIndices());
+      }))
     return true;
-  }
 
   // SROA can look through these but give them a cost.
   return false;
 }
 
@@ -856 +1039 @@
 bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
   // FIXME: Using the instsimplify logic directly for this is inefficient
   // because we have to continually rebuild the argument list even when no
   // simplifications can be performed. Until that is fixed with remapping
   // inside of instsimplify, directly constant fold calls here.
-  if (!canConstantFoldCallTo(F))
+  if (!canConstantFoldCallTo(CS, F))
     return false;
 
   // Try to re-map the arguments to constants.
   SmallVector<Constant *, 4> ConstantArgs;
   ConstantArgs.reserve(CS.arg_size());
@@ -872 +1055 @@
     if (!C)
       return false; // This argument doesn't map to a constant.
 
     ConstantArgs.push_back(C);
   }
-  if (Constant *C = ConstantFoldCall(F, ConstantArgs)) {
+  if (Constant *C = ConstantFoldCall(CS, F, ConstantArgs)) {
     SimplifiedValues[CS.getInstruction()] = C;
     return true;
   }
 
   return false;
@@ -960 +1143 @@
   // during devirtualization and so we want to give it a hefty bonus for
   // inlining, but cap that bonus in the event that inlining wouldn't pan
   // out. Pretend to inline the function, with a custom threshold.
   auto IndirectCallParams = Params;
   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
-  CallAnalyzer CA(TTI, GetAssumptionCache, PSI, *F, CS, IndirectCallParams);
+  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
+                  IndirectCallParams);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
     // threshold to get the bonus we want to apply, but don't go below zero.
     Cost -= std::max(0, CA.getThreshold() - CA.getCost());
   }
@@ -987 +1171 @@
   return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
          dyn_cast_or_null<ConstantInt>(
              SimplifiedValues.lookup(BI.getCondition()));
 }
 
+bool CallAnalyzer::visitSelectInst(SelectInst &SI) {
+  bool CheckSROA = SI.getType()->isPointerTy();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  Constant *TrueC = dyn_cast<Constant>(TrueVal);
+  if (!TrueC)
+    TrueC = SimplifiedValues.lookup(TrueVal);
+  Constant *FalseC = dyn_cast<Constant>(FalseVal);
+  if (!FalseC)
+    FalseC = SimplifiedValues.lookup(FalseVal);
+  Constant *CondC =
+      dyn_cast_or_null<Constant>(SimplifiedValues.lookup(SI.getCondition()));
+
+  if (!CondC) {
+    // Select C, X, X => X
+    if (TrueC == FalseC && TrueC) {
+      SimplifiedValues[&SI] = TrueC;
+      return true;
+    }
+
+    if (!CheckSROA)
+      return Base::visitSelectInst(SI);
+
+    std::pair<Value *, APInt> TrueBaseAndOffset =
+        ConstantOffsetPtrs.lookup(TrueVal);
+    std::pair<Value *, APInt> FalseBaseAndOffset =
+        ConstantOffsetPtrs.lookup(FalseVal);
+    if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) {
+      ConstantOffsetPtrs[&SI] = TrueBaseAndOffset;
+
+      Value *SROAArg;
+      DenseMap<Value *, int>::iterator CostIt;
+      if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt))
+        SROAArgValues[&SI] = SROAArg;
+      return true;
+    }
+
+    return Base::visitSelectInst(SI);
+  }
+
+  // Select condition is a constant.
+  Value *SelectedV = CondC->isAllOnesValue()
+                         ? TrueVal
+                         : (CondC->isNullValue()) ? FalseVal : nullptr;
+  if (!SelectedV) {
+    // Condition is a vector constant that is not all 1s or all 0s. If all
+    // operands are constants, ConstantExpr::getSelect() can handle the cases
+    // such as select vectors.
+    if (TrueC && FalseC) {
+      if (auto *C = ConstantExpr::getSelect(CondC, TrueC, FalseC)) {
+        SimplifiedValues[&SI] = C;
+        return true;
+      }
+    }
+    return Base::visitSelectInst(SI);
+  }
+
+  // Condition is either all 1s or all 0s. SI can be simplified.
+  if (Constant *SelectedC = dyn_cast<Constant>(SelectedV)) {
+    SimplifiedValues[&SI] = SelectedC;
+    return true;
+  }
+
+  if (!CheckSROA)
+    return true;
+
+  std::pair<Value *, APInt> BaseAndOffset =
+      ConstantOffsetPtrs.lookup(SelectedV);
+  if (BaseAndOffset.first) {
+    ConstantOffsetPtrs[&SI] = BaseAndOffset;
+
+    Value *SROAArg;
+    DenseMap<Value *, int>::iterator CostIt;
+    if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt))
+      SROAArgValues[&SI] = SROAArg;
+  }
+
+  return true;
+}
+
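A scalar analogue of the folds above (hypothetical values): a known condition
picks one arm outright, and identical arms fold no matter the condition.

#include <cassert>

int main() {
  bool Cond = true; // the CondC->isAllOnesValue() case
  int A = 4, B = 9, X = 4;
  assert((Cond ? A : B) == A); // constant condition selects TrueVal
  assert((Cond ? X : X) == X); // Select C, X, X => X, condition irrelevant
  return 0;
}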
992 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { 1257 bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
993 // We model unconditional switches as free, see the comments on handling 1258 // We model unconditional switches as free, see the comments on handling
994 // branches. 1259 // branches.
995 if (isa<ConstantInt>(SI.getCondition())) 1260 if (isa<ConstantInt>(SI.getCondition()))
996 return true; 1261 return true;
997 if (Value *V = SimplifiedValues.lookup(SI.getCondition())) 1262 if (Value *V = SimplifiedValues.lookup(SI.getCondition()))
998 if (isa<ConstantInt>(V)) 1263 if (isa<ConstantInt>(V))
999 return true; 1264 return true;
1000 1265
1001 // Otherwise, we need to accumulate a cost proportional to the number of 1266 // Assume the most general case where the switch is lowered into
1002 // distinct successor blocks. This fan-out in the CFG cannot be represented 1267 // either a jump table, bit test, or a balanced binary tree consisting of
1003 // for free even if we can represent the core switch as a jumptable that 1268 // case clusters without merging adjacent clusters with the same
1004 // takes a single instruction. 1269 // destination. We do not consider the switches that are lowered with a mix
1270 // of jump table/bit test/binary search tree. The cost of the switch is
1271 // proportional to the size of the tree or the size of jump table range.
1005 // 1272 //
1006 // NB: We convert large switches which are just used to initialize large phi 1273 // NB: We convert large switches which are just used to initialize large phi
1007 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent 1274 // nodes to lookup tables instead in simplify-cfg, so this shouldn't prevent
1008 // inlining those. It will prevent inlining in cases where the optimization 1275 // inlining those. It will prevent inlining in cases where the optimization
1009 // does not (yet) fire. 1276 // does not (yet) fire.
1010 SmallPtrSet<BasicBlock *, 8> SuccessorBlocks; 1277
1011 SuccessorBlocks.insert(SI.getDefaultDest()); 1278 // Maximum valid cost increased in this function.
1012 for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I) 1279 int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1;
1013 SuccessorBlocks.insert(I.getCaseSuccessor()); 1280
1014 // Add cost corresponding to the number of distinct destinations. The first 1281 // Exit early for a large switch, assuming one case needs at least one
1015 // we model as free because of fallthrough. 1282 // instruction.
1016 Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost; 1283 // FIXME: This is not true for a bit test, but ignore such case for now to
1284 // save compile-time.
1285 int64_t CostLowerBound =
1286 std::min((int64_t)CostUpperBound,
1287 (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
1288
1289 if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
1290 Cost = CostLowerBound;
1291 return false;
1292 }
1293
1294 unsigned JumpTableSize = 0;
1295 unsigned NumCaseCluster =
1296 TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize);
1297
1298 // If suitable for a jump table, consider the cost for the table size and
1299 // branch to destination.
1300 if (JumpTableSize) {
1301 int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost +
1302 4 * InlineConstants::InstrCost;
1303
1304 Cost = std::min((int64_t)CostUpperBound, JTCost + Cost);
1305 return false;
1306 }
1307
1308 // Considering forming a binary search, we should find the number of nodes
1309 // which is same as the number of comparisons when lowered. For a given
1310 // number of clusters, n, we can define a recursive function, f(n), to find
1311 // the number of nodes in the tree. The recursion is :
1312 // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3,
1313 // and f(n) = n, when n <= 3.
1314 // This will lead a binary tree where the leaf should be either f(2) or f(3)
1315 // when n > 3. So, the number of comparisons from leaves should be n, while
1316 // the number of non-leaf should be :
1317 // 2^(log2(n) - 1) - 1
1318 // = 2^log2(n) * 2^-1 - 1
1319 // = n / 2 - 1.
1320 // Considering comparisons from leaf and non-leaf nodes, we can estimate the
1321 // number of comparisons in a simple closed form :
1322 // n + n / 2 - 1 = n * 3 / 2 - 1
1323 if (NumCaseCluster <= 3) {
1324 // Suppose a comparison includes one compare and one conditional branch.
1325 Cost += NumCaseCluster * 2 * InlineConstants::InstrCost;
1326 return false;
1327 }
1328
1329 int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1;
1330 int64_t SwitchCost =
1331 ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost;
1332
1333 Cost = std::min((int64_t)CostUpperBound, SwitchCost + Cost);
1017 return false; 1334 return false;
1018 } 1335 }
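To make the new cost model concrete, the following self-contained sketch reproduces the same arithmetic for a hypothetical switch. The InstrCost value of 5 and the threshold of 225 are illustrative assumptions standing in for InlineConstants::InstrCost and the real per-callsite threshold, and case clusters are treated interchangeably with cases for brevity (the real code gets the cluster count and jump table size from TTI.getEstimatedNumberOfCaseClusters):

    #include <algorithm>
    #include <climits>
    #include <cstdint>
    #include <cstdio>

    // Illustrative stand-ins for InlineConstants::InstrCost and the threshold.
    static const int InstrCost = 5;
    static const int Threshold = 225;

    static int estimateSwitchCost(unsigned NumCases, unsigned JumpTableSize) {
      const int CostUpperBound = INT_MAX - InstrCost - 1;
      int64_t Cost = 0;

      // Early bail-out: even at one instruction per case, a huge switch can
      // already exceed the threshold.
      int64_t CostLowerBound = std::min(
          (int64_t)CostUpperBound, (int64_t)NumCases * InstrCost + Cost);
      if (CostLowerBound > Threshold)
        return (int)CostLowerBound;

      if (JumpTableSize) // table entries plus fixed bounds-check/branch cost
        return (int)std::min((int64_t)CostUpperBound,
                             (int64_t)JumpTableSize * InstrCost +
                                 4 * InstrCost);

      if (NumCases <= 3) // one compare and one conditional branch per cluster
        return (int)(NumCases * 2 * InstrCost);

      // Balanced binary search tree: 3 * n / 2 - 1 expected comparisons.
      int64_t NumCompare = 3 * (int64_t)NumCases / 2 - 1;
      return (int)std::min((int64_t)CostUpperBound,
                           NumCompare * 2 * InstrCost);
    }

    int main() {
      std::printf("%d\n", estimateSwitchCost(20, 0));  // 290: search tree
      std::printf("%d\n", estimateSwitchCost(20, 24)); // 140: jump table
    }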
1019 1336
1020 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) { 1337 bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
1021 // We never want to inline functions that contain an indirectbr. This is 1338 // We never want to inline functions that contain an indirectbr. This is
1099 1416
1100 // If the instruction is floating point, and the target says this operation 1417 // If the instruction is floating point, and the target says this operation
1101 // is expensive or the function has the "use-soft-float" attribute, this may 1418 // is expensive or the function has the "use-soft-float" attribute, this may
1102 // eventually become a library call. Treat the cost as such. 1419 // eventually become a library call. Treat the cost as such.
1103 if (I->getType()->isFloatingPointTy()) { 1420 if (I->getType()->isFloatingPointTy()) {
1104 bool hasSoftFloatAttr = false;
1105
1106 // If the function has the "use-soft-float" attribute, mark it as 1421 // If the function has the "use-soft-float" attribute, mark it as
1107 // expensive. 1422 // expensive.
1108 if (F.hasFnAttribute("use-soft-float")) {
1109 Attribute Attr = F.getFnAttribute("use-soft-float");
1110 StringRef Val = Attr.getValueAsString();
1111 if (Val == "true")
1112 hasSoftFloatAttr = true;
1113 }
1114
1115 if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive || 1423 if (TTI.getFPOpCost(I->getType()) == TargetTransformInfo::TCC_Expensive ||
1116 hasSoftFloatAttr) 1424 (F.getFnAttribute("use-soft-float").getValueAsString() == "true"))
1117 Cost += InlineConstants::CallPenalty; 1425 Cost += InlineConstants::CallPenalty;
1118 } 1426 }
1119 1427
1120 // If the instruction simplified to a constant, there is no cost to this 1428 // If the instruction simplified to a constant, there is no cost to this
1121 // instruction. Visit the instructions using our InstVisitor to account for 1429 // instruction. Visit the instructions using our InstVisitor to account for
1125 if (Base::visit(&*I)) 1433 if (Base::visit(&*I))
1126 ++NumInstructionsSimplified; 1434 ++NumInstructionsSimplified;
1127 else 1435 else
1128 Cost += InlineConstants::InstrCost; 1436 Cost += InlineConstants::InstrCost;
1129 1437
1438 using namespace ore;
1130 // If visiting this instruction detected an uninlinable pattern, abort. 1439 // If visiting this instruction detected an uninlinable pattern, abort.
1131 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca || 1440 if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
1132 HasIndirectBr || HasFrameEscape) 1441 HasIndirectBr || HasFrameEscape) {
1442 if (ORE)
1443 ORE->emit([&]() {
1444 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1445 CandidateCS.getInstruction())
1446 << NV("Callee", &F)
1447 << " has uninlinable pattern and cost is not fully computed";
1448 });
1133 return false; 1449 return false;
1450 }
1134 1451
1135 // If the caller is a recursive function then we don't want to inline 1452 // If the caller is a recursive function then we don't want to inline
1136 // functions which allocate a lot of stack space because it would increase 1453 // functions which allocate a lot of stack space because it would increase
1137 // the caller stack usage dramatically. 1454 // the caller stack usage dramatically.
1138 if (IsCallerRecursive && 1455 if (IsCallerRecursive &&
1139 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) 1456 AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
1457 if (ORE)
1458 ORE->emit([&]() {
1459 return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
1460 CandidateCS.getInstruction())
1461 << NV("Callee", &F)
1462 << " is recursive and allocates too much stack space. Cost is "
1463 "not fully computed";
1464 });
1140 return false; 1465 return false;
1466 }
1141 1467
1142 // Check if we've passed the maximum possible threshold so we don't spin in 1468 // Check if we've passed the maximum possible threshold so we don't spin in
1143 // huge basic blocks that will never inline. 1469 // huge basic blocks that will never inline.
1144 if (Cost > Threshold) 1470 if (Cost >= Threshold && !ComputeFullInlineCost)
1145 return false; 1471 return false;
1146 } 1472 }
1147 1473
1148 return true; 1474 return true;
1149 } 1475 }
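The stack-space guard above is easiest to see with a concrete pattern; the example below is hypothetical and not from this file. If leaf() were inlined into its recursive caller, every level of the recursion would carry the callee's large buffer:

    // Hypothetical C++ illustration of the guarded pattern.
    static int leaf(int x) {
      char Buf[4096]; // large stack allocation in the callee
      Buf[0] = (char)x;
      return Buf[0] + 1;
    }

    int walk(int n) { // recursive caller
      if (n <= 0)
        return 0;
      // Inlining leaf() here would add ~4 KB of stack per recursion level,
      // which is why the analyzer bails out once AllocatedSize exceeds
      // InlineConstants::TotalAllocaSizeRecursiveCaller.
      return leaf(n) + walk(n - 1);
    }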
1156 /// no constant offsets applied. 1482 /// no constant offsets applied.
1157 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { 1483 ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) {
1158 if (!V->getType()->isPointerTy()) 1484 if (!V->getType()->isPointerTy())
1159 return nullptr; 1485 return nullptr;
1160 1486
1161 const DataLayout &DL = F.getParent()->getDataLayout();
1162 unsigned IntPtrWidth = DL.getPointerSizeInBits(); 1487 unsigned IntPtrWidth = DL.getPointerSizeInBits();
1163 APInt Offset = APInt::getNullValue(IntPtrWidth); 1488 APInt Offset = APInt::getNullValue(IntPtrWidth);
1164 1489
1165 // Even though we don't look through PHI nodes, we could be called on an 1490 // Even though we don't look through PHI nodes, we could be called on an
1166 // instruction in an unreachable block, which may be on a cycle. 1491 // instruction in an unreachable block, which may be on a cycle.
1211 assert(NumVectorInstructions == 0); 1536 assert(NumVectorInstructions == 0);
1212 1537
1213 // Update the threshold based on callsite properties 1538 // Update the threshold based on callsite properties
1214 updateThreshold(CS, F); 1539 updateThreshold(CS, F);
1215 1540
1216 FiftyPercentVectorBonus = 3 * Threshold / 2;
1217 TenPercentVectorBonus = 3 * Threshold / 4;
1218 const DataLayout &DL = F.getParent()->getDataLayout();
1219
1220 // Track whether the post-inlining function would have more than one basic
1221 // block. A single basic block is often intended for inlining. Balloon the
1222 // threshold by 50% until we pass the single-BB phase.
1223 bool SingleBB = true;
1224 int SingleBBBonus = Threshold / 2;
1225
1226 // Speculatively apply all possible bonuses to Threshold. If cost exceeds 1541 // Speculatively apply all possible bonuses to Threshold. If cost exceeds
1227 // this Threshold any time, and cost cannot decrease, we can stop processing 1542 // this Threshold any time, and cost cannot decrease, we can stop processing
1228 // the rest of the function body. 1543 // the rest of the function body.
1229 Threshold += (SingleBBBonus + FiftyPercentVectorBonus); 1544 Threshold += (SingleBBBonus + VectorBonus);
1230 1545
1231 // Give out bonuses per argument, as the instructions setting them up will 1546 // Give out bonuses for the callsite, as the instructions setting them up
1232 // be gone after inlining. 1547 // will be gone after inlining.
1233 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) { 1548 Cost -= getCallsiteCost(CS, DL);
1234 if (CS.isByValArgument(I)) {
1235 // We approximate the number of loads and stores needed by dividing the
1236 // size of the byval type by the target's pointer size.
1237 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1238 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1239 unsigned PointerSize = DL.getPointerSizeInBits();
1240 // Ceiling division.
1241 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1242
1243 // If it generates more than 8 stores it is likely to be expanded as an
1244 // inline memcpy so we take that as an upper bound. Otherwise we assume
1245 // one load and one store per word copied.
1246 // FIXME: The maxStoresPerMemcpy setting from the target should be used
1247 // here instead of a magic number of 8, but it's not available via
1248 // DataLayout.
1249 NumStores = std::min(NumStores, 8U);
1250
1251 Cost -= 2 * NumStores * InlineConstants::InstrCost;
1252 } else {
1253 // For non-byval arguments subtract off one instruction per call
1254 // argument.
1255 Cost -= InlineConstants::InstrCost;
1256 }
1257 }
1258 // The call instruction also disappears after inlining.
1259 Cost -= InlineConstants::InstrCost + InlineConstants::CallPenalty;
1260
1261 // If there is only one call of the function, and it has internal linkage,
1262 // the cost of inlining it drops dramatically.
1263 bool OnlyOneCallAndLocalLinkage =
1264 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1265 if (OnlyOneCallAndLocalLinkage)
1266 Cost -= InlineConstants::LastCallToStaticBonus;
1267 1549
1268 // If this function uses the coldcc calling convention, prefer not to inline 1550 // If this function uses the coldcc calling convention, prefer not to inline
1269 // it. 1551 // it.
1270 if (F.getCallingConv() == CallingConv::Cold) 1552 if (F.getCallingConv() == CallingConv::Cold)
1271 Cost += InlineConstants::ColdccPenalty; 1553 Cost += InlineConstants::ColdccPenalty;
1272 1554
1273 // Check if we're done. This can happen due to bonuses and penalties. 1555 // Check if we're done. This can happen due to bonuses and penalties.
1274 if (Cost > Threshold) 1556 if (Cost >= Threshold && !ComputeFullInlineCost)
1275 return false; 1557 return false;
1276 1558
1277 if (F.empty()) 1559 if (F.empty())
1278 return true; 1560 return true;
1279 1561
1330 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>, 1612 typedef SetVector<BasicBlock *, SmallVector<BasicBlock *, 16>,
1331 SmallPtrSet<BasicBlock *, 16>> 1613 SmallPtrSet<BasicBlock *, 16>>
1332 BBSetVector; 1614 BBSetVector;
1333 BBSetVector BBWorklist; 1615 BBSetVector BBWorklist;
1334 BBWorklist.insert(&F.getEntryBlock()); 1616 BBWorklist.insert(&F.getEntryBlock());
1617 bool SingleBB = true;
1335 // Note that we *must not* cache the size; this loop grows the worklist. 1618 // Note that we *must not* cache the size; this loop grows the worklist.
1336 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { 1619 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
1337 // Bail out the moment we cross the threshold. This means we'll under-count 1620 // Bail out the moment we cross the threshold. This means we'll under-count
1338 // the cost, but only when undercounting doesn't matter. 1621 // the cost, but only when undercounting doesn't matter.
1339 if (Cost > Threshold) 1622 if (Cost >= Threshold && !ComputeFullInlineCost)
1340 break; 1623 break;
1341 1624
1342 BasicBlock *BB = BBWorklist[Idx]; 1625 BasicBlock *BB = BBWorklist[Idx];
1343 if (BB->empty()) 1626 if (BB->empty())
1344 continue; 1627 continue;
1372 } 1655 }
1373 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { 1656 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
1374 Value *Cond = SI->getCondition(); 1657 Value *Cond = SI->getCondition();
1375 if (ConstantInt *SimpleCond = 1658 if (ConstantInt *SimpleCond =
1376 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) { 1659 dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
1377 BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor()); 1660 BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
1378 continue; 1661 continue;
1379 } 1662 }
1380 } 1663 }
1381 1664
1382 // If we're unable to select a particular successor, just count all of 1665 // If we're unable to select a particular successor, just count all of
1394 Threshold -= SingleBBBonus; 1677 Threshold -= SingleBBBonus;
1395 SingleBB = false; 1678 SingleBB = false;
1396 } 1679 }
1397 } 1680 }
1398 1681
1682 bool OnlyOneCallAndLocalLinkage =
1683 F.hasLocalLinkage() && F.hasOneUse() && &F == CS.getCalledFunction();
1399 // If this is a noduplicate call, we can still inline as long as 1684 // If this is a noduplicate call, we can still inline as long as
1400 // inlining this would cause the removal of the caller (so the instruction 1685 // inlining this would cause the removal of the caller (so the instruction
1401 // is not actually duplicated, just moved). 1686 // is not actually duplicated, just moved).
1402 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) 1687 if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
1403 return false; 1688 return false;
1404 1689
1405 // We applied the maximum possible vector bonus at the beginning. Now, 1690 // We applied the maximum possible vector bonus at the beginning. Now,
1406 // subtract the excess bonus, if any, from the Threshold before 1691 // subtract the excess bonus, if any, from the Threshold before
1407 // comparing against Cost. 1692 // comparing against Cost.
1408 if (NumVectorInstructions <= NumInstructions / 10) 1693 if (NumVectorInstructions <= NumInstructions / 10)
1409 Threshold -= FiftyPercentVectorBonus; 1694 Threshold -= VectorBonus;
1410 else if (NumVectorInstructions <= NumInstructions / 2) 1695 else if (NumVectorInstructions <= NumInstructions / 2)
1411 Threshold -= (FiftyPercentVectorBonus - TenPercentVectorBonus); 1696 Threshold -= VectorBonus/2;
1412 1697
1413 return Cost < std::max(1, Threshold); 1698 return Cost < std::max(1, Threshold);
1414 } 1699 }
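The bonus bookkeeping can be traced with concrete numbers. In the sketch below the base threshold of 225 is an illustrative assumption, SingleBBBonus follows the Threshold / 2 formula visible in the deleted lines above, and VectorBonus is assumed to keep the old 3 * Threshold / 2 value now computed inside updateThreshold:

    // Worked example with an assumed base threshold of 225.
    int Threshold = 225;
    int SingleBBBonus = Threshold / 2;        // 112
    int VectorBonus = 3 * Threshold / 2;      // 337 (assumed; see updateThreshold)
    Threshold += SingleBBBonus + VectorBonus; // analyze against 674

    // If the callee turns out to span several blocks, SingleBBBonus is
    // subtracted back during the walk; if fewer than 10% of its instructions
    // are vector ops, the whole VectorBonus is subtracted as well, and the
    // final test is Cost < std::max(1, 225).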
1415 1700
1416 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1701 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1431 DEBUG_PRINT_STAT(Threshold); 1716 DEBUG_PRINT_STAT(Threshold);
1432 #undef DEBUG_PRINT_STAT 1717 #undef DEBUG_PRINT_STAT
1433 } 1718 }
1434 #endif 1719 #endif
1435 1720
1436 /// \brief Test that two functions either have or have not the given attribute
1437 /// at the same time.
1438 template <typename AttrKind>
1439 static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
1440 return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
1441 }
1442
1443 /// \brief Test that there are no attribute conflicts between Caller and Callee 1721 /// \brief Test that there are no attribute conflicts between Caller and Callee
1444 /// that prevent inlining. 1722 /// that prevent inlining.
1445 static bool functionsHaveCompatibleAttributes(Function *Caller, 1723 static bool functionsHaveCompatibleAttributes(Function *Caller,
1446 Function *Callee, 1724 Function *Callee,
1447 TargetTransformInfo &TTI) { 1725 TargetTransformInfo &TTI) {
1448 return TTI.areInlineCompatible(Caller, Callee) && 1726 return TTI.areInlineCompatible(Caller, Callee) &&
1449 AttributeFuncs::areInlineCompatible(*Caller, *Callee); 1727 AttributeFuncs::areInlineCompatible(*Caller, *Callee);
1450 } 1728 }
1451 1729
1730 int llvm::getCallsiteCost(CallSite CS, const DataLayout &DL) {
1731 int Cost = 0;
1732 for (unsigned I = 0, E = CS.arg_size(); I != E; ++I) {
1733 if (CS.isByValArgument(I)) {
1734 // We approximate the number of loads and stores needed by dividing the
1735 // size of the byval type by the target's pointer size.
1736 PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType());
1737 unsigned TypeSize = DL.getTypeSizeInBits(PTy->getElementType());
1738 unsigned PointerSize = DL.getPointerSizeInBits();
1739 // Ceiling division.
1740 unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
1741
1742 // If it generates more than 8 stores it is likely to be expanded as an
1743 // inline memcpy so we take that as an upper bound. Otherwise we assume
1744 // one load and one store per word copied.
1745 // FIXME: The maxStoresPerMemcpy setting from the target should be used
1746 // here instead of a magic number of 8, but it's not available via
1747 // DataLayout.
1748 NumStores = std::min(NumStores, 8U);
1749
1750 Cost += 2 * NumStores * InlineConstants::InstrCost;
1751 } else {
1752 // For non-byval arguments, add one instruction per call
1753 // argument.
1754 Cost += InlineConstants::InstrCost;
1755 }
1756 }
1757 // The call instruction also disappears after inlining.
1758 Cost += InlineConstants::InstrCost + InlineConstants::CallPenalty;
1759 return Cost;
1760 }
1761
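A worked instance of the byval approximation, under assumed constants (InlineConstants::InstrCost = 5, InlineConstants::CallPenalty = 25) and a 64-bit target:

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumed stand-ins for InlineConstants::InstrCost and ::CallPenalty.
      const unsigned InstrCost = 5, CallPenalty = 25;
      unsigned TypeSize = 264;   // a 33-byte byval struct, in bits
      unsigned PointerSize = 64; // pointer width, in bits
      // Ceiling division: ceil(264 / 64) = 5 words to copy.
      unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize;
      NumStores = std::min(NumStores, 8U); // memcpy-expansion cap

      unsigned Cost = 2 * NumStores * InstrCost; // 50: load + store per word
      Cost += InstrCost + CallPenalty;           // 80: the call disappears too
      std::printf("callsite cost: %u\n", Cost);
    }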
1452 InlineCost llvm::getInlineCost( 1762 InlineCost llvm::getInlineCost(
1453 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI, 1763 CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
1454 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 1764 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1455 ProfileSummaryInfo *PSI) { 1765 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1766 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1456 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI, 1767 return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
1457 GetAssumptionCache, PSI); 1768 GetAssumptionCache, GetBFI, PSI, ORE);
1458 } 1769 }
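For orientation, a hypothetical caller-side sketch of the updated entry point; all analysis objects are assumed to come from the surrounding inliner pass, and only the signature shown in this revision is relied on:

    #include "llvm/Analysis/AssumptionCache.h"
    #include "llvm/Analysis/InlineCost.h"
    #include "llvm/Analysis/ProfileSummaryInfo.h"
    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/IR/CallSite.h"
    #include <functional>
    using namespace llvm;

    // Hypothetical helper: true when CS looks profitable to inline.
    static bool shouldInline(CallSite CS, const InlineParams &Params,
                             TargetTransformInfo &CalleeTTI,
                             std::function<AssumptionCache &(Function &)> &GetAC,
                             ProfileSummaryInfo *PSI) {
      // BFI and ORE are optional; passing None/nullptr disables the
      // frequency-based heuristics and the missed-optimization remarks.
      InlineCost IC = getInlineCost(CS, Params, CalleeTTI, GetAC,
                                    /*GetBFI=*/None, PSI, /*ORE=*/nullptr);
      return bool(IC); // always => true, never => false, else Cost < Threshold
    }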
1459 1770
1460 InlineCost llvm::getInlineCost( 1771 InlineCost llvm::getInlineCost(
1461 CallSite CS, Function *Callee, const InlineParams &Params, 1772 CallSite CS, Function *Callee, const InlineParams &Params,
1462 TargetTransformInfo &CalleeTTI, 1773 TargetTransformInfo &CalleeTTI,
1463 std::function<AssumptionCache &(Function &)> &GetAssumptionCache, 1774 std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
1464 ProfileSummaryInfo *PSI) { 1775 Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
1776 ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
1465 1777
1466 // Cannot inline indirect calls. 1778 // Cannot inline indirect calls.
1467 if (!Callee) 1779 if (!Callee)
1468 return llvm::InlineCost::getNever(); 1780 return llvm::InlineCost::getNever();
1469 1781
1475 return llvm::InlineCost::getNever(); 1787 return llvm::InlineCost::getNever();
1476 } 1788 }
1477 1789
1478 // Never inline functions with conflicting attributes (unless callee has 1790 // Never inline functions with conflicting attributes (unless callee has
1479 // always-inline attribute). 1791 // always-inline attribute).
1480 if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee, CalleeTTI)) 1792 Function *Caller = CS.getCaller();
1793 if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
1481 return llvm::InlineCost::getNever(); 1794 return llvm::InlineCost::getNever();
1482 1795
1483 // Don't inline this call if the caller has the optnone attribute. 1796 // Don't inline this call if the caller has the optnone attribute.
1484 if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone)) 1797 if (Caller->hasFnAttribute(Attribute::OptimizeNone))
1485 return llvm::InlineCost::getNever(); 1798 return llvm::InlineCost::getNever();
1486 1799
1487 // Don't inline functions which can be interposed at link-time. Don't inline 1800 // Don't inline functions which can be interposed at link-time. Don't inline
1488 // functions marked noinline or call sites marked noinline. 1801 // functions marked noinline or call sites marked noinline.
1489 // Note: inlining non-exact non-interposable fucntions is fine, since we know 1802 // Note: inlining non-exact non-interposable functions is fine, since we know
1490 // we have *a* correct implementation of the source level function. 1803 // we have *a* correct implementation of the source level function.
1491 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) || 1804 if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
1492 CS.isNoInline()) 1805 CS.isNoInline())
1493 return llvm::InlineCost::getNever(); 1806 return llvm::InlineCost::getNever();
1494 1807
1495 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() 1808 DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
1496 << "...\n"); 1809 << "... (caller:" << Caller->getName() << ")\n");
1497 1810
1498 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, PSI, *Callee, CS, Params); 1811 CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
1812 Params);
1499 bool ShouldInline = CA.analyzeCall(CS); 1813 bool ShouldInline = CA.analyzeCall(CS);
1500 1814
1501 DEBUG(CA.dump()); 1815 DEBUG(CA.dump());
1502 1816
1503 // Check if there was a reason to force inlining or no inlining. 1817 // Check if there was a reason to force inlining or no inlining.
1566 Params.HintThreshold = HintThreshold; 1880 Params.HintThreshold = HintThreshold;
1567 1881
1568 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold. 1882 // Set the HotCallSiteThreshold knob from the -hot-callsite-threshold.
1569 Params.HotCallSiteThreshold = HotCallSiteThreshold; 1883 Params.HotCallSiteThreshold = HotCallSiteThreshold;
1570 1884
1571 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the 1885 // If the -locally-hot-callsite-threshold is explicitly specified, use it to
1886 // populate LocallyHotCallSiteThreshold. Later, we populate
1887 // Params.LocallyHotCallSiteThreshold from -locally-hot-callsite-threshold if
1888 // we know that optimization level is O3 (in the getInlineParams variant that
1889 // takes the opt and size levels).
1890 // FIXME: Remove this check (and make the assignment unconditional) after
1891 // addressing size regression issues at O2.
1892 if (LocallyHotCallSiteThreshold.getNumOccurrences() > 0)
1893 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
1894
1895 // Set the ColdCallSiteThreshold knob from the -inline-cold-callsite-threshold.
1896 Params.ColdCallSiteThreshold = ColdCallSiteThreshold;
1897
1572 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the 1898 // Set the OptMinSizeThreshold and OptSizeThreshold params only if the
1573 // -inlinehint-threshold commandline option is not explicitly given. If that 1899 // -inlinehint-threshold commandline option is not explicitly given. If that
1574 // option is present, then its value applies even for callees with size and 1900 // option is present, then its value applies even for callees with size and
1575 // minsize attributes. 1901 // minsize attributes.
1576 // If the -inline-threshold is not specified, set the ColdThreshold from the 1902 // If the -inline-threshold is not specified, set the ColdThreshold from the
1603 return InlineConstants::OptMinSizeThreshold; 1929 return InlineConstants::OptMinSizeThreshold;
1604 return InlineThreshold; 1930 return InlineThreshold;
1605 } 1931 }
1606 1932
1607 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) { 1933 InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
1608 return getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel)); 1934 auto Params =
1609 } 1935 getInlineParams(computeThresholdFromOptLevels(OptLevel, SizeOptLevel));
1936 // At O3, use the value of -locally-hot-callsite-threshold option to populate
1937 // Params.LocallyHotCallSiteThreshold. Below O3, this flag has effect only
1938 // when it is specified explicitly.
1939 if (OptLevel > 2)
1940 Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
1941 return Params;
1942 }
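Since LocallyHotCallSiteThreshold is only copied unconditionally at O3, the two calls below can produce different params for the same base threshold; a brief hypothetical illustration:

    #include "llvm/Analysis/InlineCost.h"
    using namespace llvm;

    void illustrate() {
      // Below O3, Params.LocallyHotCallSiteThreshold is populated only when
      // -locally-hot-callsite-threshold is given explicitly.
      InlineParams AtO2 = getInlineParams(2, /*SizeOptLevel=*/0);

      // At O3 the flag's value (525 by default) is always copied in.
      InlineParams AtO3 = getInlineParams(3, /*SizeOptLevel=*/0);
      (void)AtO2;
      (void)AtO3;
    }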