121
|
1 //===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
|
|
2 //
|
|
3 // The LLVM Compiler Infrastructure
|
|
4 //
|
|
5 // This file is distributed under the University of Illinois Open Source
|
|
6 // License. See LICENSE.TXT for details.
|
|
7 //
|
|
8 //===----------------------------------------------------------------------===//
|
|
9 //
|
|
10 // This file implements the transformation that optimizes memory intrinsics
|
|
11 // such as memcpy using the size value profile. When memory intrinsic size
|
|
12 // value profile metadata is available, a single memory intrinsic is expanded
|
|
13 // to a sequence of guarded specialized versions that are called with the
|
|
14 // hottest size(s), for later expansion into more optimal inline sequences.
|
|
15 //
|
|
16 //===----------------------------------------------------------------------===//
|
|
17
|
|
18 #include "llvm/ADT/ArrayRef.h"
|
|
19 #include "llvm/ADT/Statistic.h"
|
|
20 #include "llvm/ADT/StringRef.h"
|
|
21 #include "llvm/ADT/Twine.h"
|
|
22 #include "llvm/Analysis/BlockFrequencyInfo.h"
|
|
23 #include "llvm/Analysis/GlobalsModRef.h"
|
|
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
|
|
25 #include "llvm/IR/BasicBlock.h"
|
|
26 #include "llvm/IR/CallSite.h"
|
|
27 #include "llvm/IR/DerivedTypes.h"
|
|
28 #include "llvm/IR/Function.h"
|
|
29 #include "llvm/IR/IRBuilder.h"
|
|
30 #include "llvm/IR/InstVisitor.h"
|
|
31 #include "llvm/IR/InstrTypes.h"
|
|
32 #include "llvm/IR/Instruction.h"
|
|
33 #include "llvm/IR/Instructions.h"
|
|
34 #include "llvm/IR/LLVMContext.h"
|
|
35 #include "llvm/IR/PassManager.h"
|
|
36 #include "llvm/IR/Type.h"
|
|
37 #include "llvm/Pass.h"
|
|
38 #include "llvm/PassRegistry.h"
|
|
39 #include "llvm/PassSupport.h"
|
|
40 #include "llvm/ProfileData/InstrProf.h"
|
|
41 #include "llvm/Support/Casting.h"
|
|
42 #include "llvm/Support/CommandLine.h"
|
|
43 #include "llvm/Support/Debug.h"
|
|
44 #include "llvm/Support/ErrorHandling.h"
|
|
45 #include "llvm/Support/MathExtras.h"
|
|
46 #include "llvm/Transforms/Instrumentation.h"
|
|
47 #include "llvm/Transforms/PGOInstrumentation.h"
|
|
48 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
49 #include <cassert>
|
|
50 #include <cstdint>
|
|
51 #include <vector>
|
|
52
|
|
using namespace llvm;

#define DEBUG_TYPE "pgo-memop-opt"

// Pass-wide statistics: how many memop intrinsics carried size-value profile
// annotations, and how many of those were actually versioned/optimized.
STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
|
|
59
|
|
// The minimum call count to optimize memory intrinsic calls.
static cl::opt<unsigned>
    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
                        cl::init(1000),
                        cl::desc("The minimum count to optimize memory "
                                 "intrinsic calls"));

// Command line option to disable memory intrinsic optimization. The default is
// false. This is for debug purpose.
static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
                                     cl::Hidden, cl::desc("Disable optimize"));

// The percent threshold to optimize memory intrinsic calls.
// A size value must account for at least this percentage of the remaining
// (not-yet-promoted) count to be worth a specialized version.
static cl::opt<unsigned>
    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
                          cl::Hidden, cl::ZeroOrMore,
                          cl::desc("The percentage threshold for the "
                                   "memory intrinsic calls optimization"));

// Maximum number of versions for optimizing memory intrinsic call.
// 0 means no limit on the number of versions.
static cl::opt<unsigned>
    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
                    cl::ZeroOrMore,
                    cl::desc("The max version for the optimized memory "
                             " intrinsic calls"));

// Scale the counts from the annotation using the BB count value.
// Useful when the value profile and the edge profile were collected from
// different runs, so the annotation counts need re-normalization.
static cl::opt<bool>
    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
                    cl::desc("Scale the memop size counts using the basic "
                             " block count value"));

// This option sets the range of precise profile memop sizes.
// Defined in the PGO instrumentation code; shared with the annotation pass.
extern cl::opt<std::string> MemOPSizeRange;

// This option sets the value that groups large memop sizes.
// Defined in the PGO instrumentation code; shared with the annotation pass.
extern cl::opt<unsigned> MemOPSizeLarge;
|
|
97
|
|
namespace {

/// Legacy (old pass manager) wrapper for the memop size optimization.
/// All real work happens in PGOMemOPSizeOptImpl via runOnFunction.
class PGOMemOPSizeOptLegacyPass : public FunctionPass {
public:
  static char ID;

  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "PGOMemOPSize"; }

private:
  bool runOnFunction(Function &F) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // BFI is needed to scale value-profile counts by the block count;
    // ORE is needed to emit optimization remarks.
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};
} // end anonymous namespace
|
|
118
|
|
char PGOMemOPSizeOptLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
                      "Optimize memory intrinsic using its size value profile",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
                    "Optimize memory intrinsic using its size value profile",
                    false, false)

/// Factory used by the legacy pass-manager pipeline builder.
FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
  return new PGOMemOPSizeOptLegacyPass();
}
|
|
131
|
|
132 namespace {
|
|
133 class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
|
|
134 public:
|
|
135 MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
|
|
136 OptimizationRemarkEmitter &ORE)
|
|
137 : Func(Func), BFI(BFI), ORE(ORE), Changed(false) {
|
|
138 ValueDataArray =
|
|
139 llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
|
|
140 // Get the MemOPSize range information from option MemOPSizeRange,
|
|
141 getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
|
|
142 PreciseRangeLast);
|
|
143 }
|
|
144 bool isChanged() const { return Changed; }
|
|
145 void perform() {
|
|
146 WorkList.clear();
|
|
147 visit(Func);
|
|
148
|
|
149 for (auto &MI : WorkList) {
|
|
150 ++NumOfPGOMemOPAnnotate;
|
|
151 if (perform(MI)) {
|
|
152 Changed = true;
|
|
153 ++NumOfPGOMemOPOpt;
|
|
154 DEBUG(dbgs() << "MemOP call: " << MI->getCalledFunction()->getName()
|
|
155 << "is Transformed.\n");
|
|
156 }
|
|
157 }
|
|
158 }
|
|
159
|
|
160 void visitMemIntrinsic(MemIntrinsic &MI) {
|
|
161 Value *Length = MI.getLength();
|
|
162 // Not perform on constant length calls.
|
|
163 if (dyn_cast<ConstantInt>(Length))
|
|
164 return;
|
|
165 WorkList.push_back(&MI);
|
|
166 }
|
|
167
|
|
168 private:
|
|
169 Function &Func;
|
|
170 BlockFrequencyInfo &BFI;
|
|
171 OptimizationRemarkEmitter &ORE;
|
|
172 bool Changed;
|
|
173 std::vector<MemIntrinsic *> WorkList;
|
|
174 // Start of the previse range.
|
|
175 int64_t PreciseRangeStart;
|
|
176 // Last value of the previse range.
|
|
177 int64_t PreciseRangeLast;
|
|
178 // The space to read the profile annotation.
|
|
179 std::unique_ptr<InstrProfValueData[]> ValueDataArray;
|
|
180 bool perform(MemIntrinsic *MI);
|
|
181
|
|
182 // This kind shows which group the value falls in. For PreciseValue, we have
|
|
183 // the profile count for that value. LargeGroup groups the values that are in
|
|
184 // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
|
|
185 enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
|
|
186
|
|
187 MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
|
|
188 if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
|
|
189 return LargeGroup;
|
|
190 if (Value == PreciseRangeLast + 1)
|
|
191 return NonLargeGroup;
|
|
192 return PreciseValue;
|
|
193 }
|
|
194 };
|
|
195
|
|
196 static const char *getMIName(const MemIntrinsic *MI) {
|
|
197 switch (MI->getIntrinsicID()) {
|
|
198 case Intrinsic::memcpy:
|
|
199 return "memcpy";
|
|
200 case Intrinsic::memmove:
|
|
201 return "memmove";
|
|
202 case Intrinsic::memset:
|
|
203 return "memset";
|
|
204 default:
|
|
205 return "unknown";
|
|
206 }
|
|
207 }
|
|
208
|
|
209 static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
|
|
210 assert(Count <= TotalCount);
|
|
211 if (Count < MemOPCountThreshold)
|
|
212 return false;
|
|
213 if (Count < TotalCount * MemOPPercentThreshold / 100)
|
|
214 return false;
|
|
215 return true;
|
|
216 }
|
|
217
|
|
218 static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
|
|
219 uint64_t Denom) {
|
|
220 if (!MemOPScaleCount)
|
|
221 return Count;
|
|
222 bool Overflowed;
|
|
223 uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
|
|
224 return ScaleCount / Denom;
|
|
225 }
|
|
226
|
|
// Version one memory intrinsic call on its hottest profiled sizes: replace
//   mem_op(..., size)
// with a switch on `size` whose cases call the intrinsic with a constant
// size (enabling later inline expansion), falling back to the original
// variable-size call in the default block.
// Returns true iff the call was transformed.
bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
  assert(MI);
  // memmove is not handled by this transformation.
  if (MI->getIntrinsicID() == Intrinsic::memmove)
    return false;

  // Read up to MemOPMaxVersion + 2 value records (promotable values plus the
  // two grouped sentinel records) from the !prof metadata on the call.
  uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
  uint64_t TotalCount;
  if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
                                ValueDataArray.get(), NumVals, TotalCount))
    return false;

  // ActualCount: the count we treat as the true execution count of the call.
  // When scaling is on, that is the BB profile count; the per-value counts
  // are rescaled by ActualCount/SavedTotalCount below.
  uint64_t ActualCount = TotalCount;
  uint64_t SavedTotalCount = TotalCount;
  if (MemOPScaleCount) {
    auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
    if (!BBEdgeCount)
      return false;
    ActualCount = *BBEdgeCount;
  }

  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
  DEBUG(dbgs() << "Read one memory intrinsic profile with count " << ActualCount
               << "\n");
  DEBUG(
      for (auto &VD
           : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });

  if (ActualCount < MemOPCountThreshold)
    return false;
  // Skip if the total value profiled count is 0, in which case we can't
  // scale up the counts properly (and there is no profitable transformation).
  if (TotalCount == 0)
    return false;

  TotalCount = ActualCount;
  if (MemOPScaleCount)
    DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
                 << " denominator = " << SavedTotalCount << "\n");

  // Keeping track of the count of the default case:
  // RemainCount tracks the scaled count left for the default case;
  // SavedRemainCount tracks the unscaled count for re-annotation below.
  uint64_t RemainCount = TotalCount;
  uint64_t SavedRemainCount = SavedTotalCount;
  SmallVector<uint64_t, 16> SizeIds;
  SmallVector<uint64_t, 16> CaseCounts;
  uint64_t MaxCount = 0;
  unsigned Version = 0;
  // Default case is in the front -- save the slot here.
  CaseCounts.push_back(0);
  for (auto &VD : VDs) {
    int64_t V = VD.Value;
    uint64_t C = VD.Count;
    if (MemOPScaleCount)
      C = getScaledCount(C, ActualCount, SavedTotalCount);

    // Only care precise value here.
    if (getMemOPSizeKind(V) != PreciseValue)
      continue;

    // ValueCounts are sorted on the count. Break at the first un-profitable
    // value.
    if (!isProfitable(C, RemainCount))
      break;

    SizeIds.push_back(V);
    CaseCounts.push_back(C);
    if (C > MaxCount)
      MaxCount = C;

    assert(RemainCount >= C);
    RemainCount -= C;
    assert(SavedRemainCount >= VD.Count);
    SavedRemainCount -= VD.Count;

    // MemOPMaxVersion == 0 means "no limit on the number of versions".
    if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
      break;
  }

  if (Version == 0)
    return false;

  CaseCounts[0] = RemainCount;
  if (RemainCount > MaxCount)
    MaxCount = RemainCount;

  uint64_t SumForOpt = TotalCount - RemainCount;

  DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
               << " Versions (covering " << SumForOpt << " out of "
               << TotalCount << ")\n");

  // mem_op(..., size)
  // ==>
  // switch (size) {
  // case s1:
  //    mem_op(..., s1);
  //    goto merge_bb;
  // case s2:
  //    mem_op(..., s2);
  //    goto merge_bb;
  // ...
  // default:
  //    mem_op(..., size);
  //    goto merge_bb;
  // }
  // merge_bb:

  BasicBlock *BB = MI->getParent();
  DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
  DEBUG(dbgs() << *BB << "\n");
  auto OrigBBFreq = BFI.getBlockFreq(BB);

  // Split twice: [BB | DefaultBB(MI) | MergeBB]. The switch replaces BB's
  // terminator; each new case block branches to MergeBB.
  BasicBlock *DefaultBB = SplitBlock(BB, MI);
  BasicBlock::iterator It(*MI);
  ++It;
  assert(It != DefaultBB->end());
  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
  MergeBB->setName("MemOP.Merge");
  BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
  DefaultBB->setName("MemOP.Default");

  auto &Ctx = Func.getContext();
  IRBuilder<> IRB(BB);
  BB->getTerminator()->eraseFromParent();
  Value *SizeVar = MI->getLength();
  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());

  // Clear the value profile data.
  MI->setMetadata(LLVMContext::MD_prof, nullptr);
  // If all promoted, we don't need the MD.prof metadata.
  if (SavedRemainCount > 0 || Version != NumVals)
    // Otherwise we need update with the un-promoted records back.
    annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
                      SavedRemainCount, IPVK_MemOPSize, NumVals);

  DEBUG(dbgs() << "\n\n== Basic Block After==\n");

  for (uint64_t SizeId : SizeIds) {
    BasicBlock *CaseBB = BasicBlock::Create(
        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
    Instruction *NewInst = MI->clone();
    // Fix the argument.
    // NOTE(review): NewInst is a clone of MI, a MemIntrinsic, so this cast
    // should never fail; cast<> (asserting) looks more appropriate than an
    // unchecked dyn_cast<> — confirm.
    MemIntrinsic * MemI = dyn_cast<MemIntrinsic>(NewInst);
    IntegerType *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
    assert(SizeType && "Expected integer type size argument.");
    ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
    MemI->setLength(CaseSizeId);
    CaseBB->getInstList().push_back(NewInst);
    IRBuilder<> IRBCase(CaseBB);
    IRBCase.CreateBr(MergeBB);
    SI->addCase(CaseSizeId, CaseBB);
    DEBUG(dbgs() << *CaseBB << "\n");
  }
  // Attach branch weights to the switch so later passes keep the profile.
  setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);

  DEBUG(dbgs() << *BB << "\n");
  DEBUG(dbgs() << *DefaultBB << "\n");
  DEBUG(dbgs() << *MergeBB << "\n");

  ORE.emit([&]() {
    using namespace ore;
    return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
           << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
           << " with count " << NV("Count", SumForOpt) << " out of "
           << NV("Total", TotalCount) << " for " << NV("Versions", Version)
           << " versions";
  });

  return true;
}
|
|
396 } // namespace
|
|
397
|
|
398 static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
|
|
399 OptimizationRemarkEmitter &ORE) {
|
|
400 if (DisableMemOPOPT)
|
|
401 return false;
|
|
402
|
|
403 if (F.hasFnAttribute(Attribute::OptimizeForSize))
|
|
404 return false;
|
|
405 MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE);
|
|
406 MemOPSizeOpt.perform();
|
|
407 return MemOPSizeOpt.isChanged();
|
|
408 }
|
|
409
|
|
410 bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
|
|
411 BlockFrequencyInfo &BFI =
|
|
412 getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
|
|
413 auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
|
|
414 return PGOMemOPSizeOptImpl(F, BFI, ORE);
|
|
415 }
|
|
416
|
|
417 namespace llvm {
|
|
418 char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
|
|
419
|
|
420 PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
|
|
421 FunctionAnalysisManager &FAM) {
|
|
422 auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
|
|
423 auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
|
|
424 bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE);
|
|
425 if (!Changed)
|
|
426 return PreservedAnalyses::all();
|
|
427 auto PA = PreservedAnalyses();
|
|
428 PA.preserve<GlobalsAA>();
|
|
429 return PA;
|
|
430 }
|
|
431 } // namespace llvm
|