CbC_llvm: comparison of tools/llvm-exegesis/lib/Uops.cpp @ 148:63bd29f05246 (merged)
author:   Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date:     Wed, 14 Aug 2019 19:46:37 +0900
parents:  c2174574ed3a
children: (none)
compared revisions: 146:3fc4d5c3e21e and 148:63bd29f05246
//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Uops.h"

#include "Assembler.h"
#include "BenchmarkRunner.h"
#include "MCInstrDescView.h"
#include "Target.h"

// FIXME: Load constants into registers (e.g. with fld1) to not break
// instructions like x87.

// Ideally we would like the only limitation on executing uops to be the issue
// ports. Maximizing port pressure increases the likelihood that the load is
// distributed evenly across possible ports.

// To achieve that, one approach is to generate instructions that do not have
// data dependencies between them.
//
// For some instructions, this is trivial:
//   mov rax, qword ptr [rsi]
//   mov rax, qword ptr [rsi]
//   mov rax, qword ptr [rsi]
//   mov rax, qword ptr [rsi]
// For the above snippet, Haswell just renames rax four times and executes the
// four instructions two at a time on P23 and P0126.
//
// For some instructions, we just need to make sure that the source is
// different from the destination. For example, IDIV8r reads from a GPR and
// writes to AX. We just need to ensure that the Var is assigned a
// register which is different from AX:
//   idiv bx
//   idiv bx
//   idiv bx
//   idiv bx
// The above snippet will be able to fully saturate the ports, while the same
// with ax would issue one uop every `latency(IDIV8r)` cycles.
//
// Some instructions make this harder because they both read and write from
// the same register:
//   inc rax
//   inc rax
//   inc rax
//   inc rax
// This has a data dependency from each instruction to the next, limiting the
// number of instructions that can be issued in parallel.
// It turns out that this is not a big issue on recent Intel CPUs because they
// have heuristics to balance port pressure. In the snippet above, subsequent
// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
// might end up executing them all on P0 (just because they can), or try
// avoiding P5 because it's usually under high pressure from vector
// instructions.
// This issue is even more important for high-latency instructions because
// they increase the idle time of the CPU, e.g.:
//   imul rax, rbx
//   imul rax, rbx
//   imul rax, rbx
//   imul rax, rbx
//
// To avoid that, we do the renaming statically by generating as many
// independent exclusive assignments as possible (until all possible registers
// are exhausted), e.g.:
//   imul rax, rbx
//   imul rcx, rbx
//   imul rdx, rbx
//   imul r8, rbx
//
// Some instructions even make the above static renaming impossible because
// they implicitly read and write from the same operand, e.g. ADC16rr reads
// and writes from EFLAGS.
// In that case we just use a greedy register assignment and hope for the
// best.

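The comment above reasons about throughput in terms of dependency chains and issue ports. As a rough, standalone illustration of that reasoning (not code from the tool), the sketch below computes the uops-per-cycle ceiling for the imul example under assumed numbers: a 3-cycle latency and a single multiply-capable port. Both figures are assumptions made for the example, not measured values.

// Back-of-the-envelope illustration with assumed latency/port numbers; it
// only mirrors the reasoning in the comment above.
#include <algorithm>
#include <cstdio>

int main() {
  const double ImulLatencyCycles = 3.0; // assumed imul latency
  const double MultiplyPorts = 1.0;     // assumed number of ports that run imul
  for (int Chains : {1, 2, 4}) {
    // Each independent chain can start one imul every ImulLatencyCycles;
    // the port count caps the total at MultiplyPorts uops per cycle.
    const double UopsPerCycle =
        std::min(Chains / ImulLatencyCycles, MultiplyPorts);
    std::printf("%d independent chain(s): ~%.2f uops/cycle\n", Chains,
                UopsPerCycle);
  }
  return 0;
}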
namespace llvm {
namespace exegesis {

static llvm::SmallVector<const Variable *, 8>
getVariablesWithTiedOperands(const Instruction &Instr) {
  llvm::SmallVector<const Variable *, 8> Result;
  for (const auto &Var : Instr.Variables)
    if (Var.hasTiedOperands())
      Result.push_back(&Var);
  return Result;
}

static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
  assert(a.size() == b.size());
  for (auto I : b.set_bits())
    a.reset(I);
}

UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;

UopsSnippetGenerator::~UopsSnippetGenerator() = default;

void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}

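instantiateMemoryOperands gives every template its own offset into the scratch buffer (I * MemStep) and then pads the list so that at least kMinNumDifferentAddresses distinct addresses are touched. The standalone sketch below reproduces only that offset-and-padding logic on a plain struct; the names Template, MemStep and MinAddresses are invented for the illustration, while the real code works on InstructionTemplate through the ExegesisTarget hooks.

// Minimal sketch of the offset assignment and padding only; types and
// constants here are invented for the example.
#include <cstdio>
#include <vector>

struct Template {
  unsigned Offset = 0; // stands in for the memory operand's displacement
};

static void assignDistinctOffsets(std::vector<Template> &Templates,
                                  unsigned MemStep, size_t MinAddresses) {
  const size_t OriginalSize = Templates.size();
  size_t I = 0;
  // First pass: one distinct offset per existing template.
  for (Template &T : Templates)
    T.Offset = I++ * MemStep;
  // Padding: duplicate templates round-robin until enough distinct
  // addresses are covered.
  while (Templates.size() < MinAddresses) {
    Template T = Templates[I % OriginalSize];
    T.Offset = I++ * MemStep;
    Templates.push_back(T);
  }
}

int main() {
  std::vector<Template> Templates(2);
  assignDistinctOffsets(Templates, /*MemStep=*/64, /*MinAddresses=*/6);
  for (const Template &T : Templates)
    std::printf("offset %u\n", T.Offset); // 0, 64, 128, 192, 256, 320
  return 0;
}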
static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
    const LLVMState &State, const InstructionTemplate &IT,
    const ArrayRef<const Variable *> TiedVariables,
    const BitVector *ScratchSpaceAliasedRegs) {
  std::vector<InstructionTemplate> Instructions;
  // Assign registers to variables in a round-robin manner. This is simple but
  // ensures that the most register-constrained variable does not get starved.
  std::vector<BitVector> PossibleRegsForVar;
  for (const Variable *Var : TiedVariables) {
    assert(Var);
    const Operand &Op = IT.Instr.getPrimaryOperand(*Var);
    assert(Op.isReg());
    BitVector PossibleRegs = State.getRATC().emptyRegisters();
    if (ScratchSpaceAliasedRegs) {
      PossibleRegs |= *ScratchSpaceAliasedRegs;
    }
    PossibleRegs.flip();
    PossibleRegs &= Op.getRegisterAliasing().sourceBits();
    PossibleRegsForVar.push_back(std::move(PossibleRegs));
  }
  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
  while (true) {
    InstructionTemplate TmpIT = IT;
    // Find a possible register for each variable in turn, marking the
    // register as taken.
    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
      const int NextPossibleReg =
          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
      if (NextPossibleReg <= 0) {
        return Instructions;
      }
      TmpIT.getValueFor(*TiedVariables[VarId]) =
          llvm::MCOperand::createReg(NextPossibleReg);
      // Bump iterator.
      Iterators[VarId] = NextPossibleReg;
      // Prevent other variables from using the register.
      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
        OtherPossibleRegs.reset(NextPossibleReg);
      }
    }
    Instructions.push_back(std::move(TmpIT));
  }
}

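The function above keeps emitting copies of the instruction, giving each tied variable a fresh register and removing that register from every variable's candidate set, until some variable runs out. The following is a simplified, standalone analogue of that exclusive round-robin assignment; the 32-register universe, the bitmask literals and the function name are invented for the example, whereas the real code operates on llvm::BitVector register classes and InstructionTemplates.

// Simplified analogue of the exclusive register assignment loop; everything
// here is illustrative, not code from llvm-exegesis.
#include <bitset>
#include <cstdio>
#include <vector>

static std::vector<std::vector<int>>
enumerateExclusiveAssignments(std::vector<std::bitset<32>> CandidatesPerVar) {
  std::vector<std::vector<int>> Assignments;
  while (true) {
    std::vector<int> Current;
    for (std::bitset<32> &Candidates : CandidatesPerVar) {
      // Pick the lowest remaining candidate register for this variable.
      int Reg = -1;
      for (int R = 0; R < 32; ++R)
        if (Candidates.test(R)) {
          Reg = R;
          break;
        }
      if (Reg < 0)
        return Assignments; // a variable ran out of registers: stop.
      Current.push_back(Reg);
      // Make the register exclusive: no variable may reuse it later.
      for (std::bitset<32> &Other : CandidatesPerVar)
        Other.reset(Reg);
    }
    Assignments.push_back(Current);
  }
}

int main() {
  // Two tied variables drawing from overlapping register classes.
  std::vector<std::bitset<32>> Candidates = {std::bitset<32>("1111"),
                                             std::bitset<32>("1110")};
  for (const auto &Assignment : enumerateExclusiveAssignments(Candidates)) {
    for (int Reg : Assignment)
      std::printf("r%d ", Reg);
    std::printf("\n"); // prints "r0 r1" then "r2 r3"
  }
  return 0;
}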
llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
  CodeTemplate CT;
  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  if (Instr.hasMemoryOperands()) {
    const auto &ET = State.getExegesisTarget();
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    CT.Info = "instruction has tied variables, using static renaming.";
    CT.Instructions = generateSnippetUsingStaticRenaming(
        State, IT, TiedVariables, ScratchSpaceAliasedRegs);
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
  // No tied variables, we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables picking Uses different from defs";
  CT.Instructions.push_back(std::move(IT));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}

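generateCodeTemplates is a decision cascade: no self-aliasing means the instruction can simply be repeated, implicit aliasing (e.g. through EFLAGS) falls back to greedy repetition, tied variables trigger static renaming, and otherwise defs are randomized with uses chosen not to alias them. The sketch below only summarizes that cascade; the Strategy enum and the boolean parameters are invented for the summary, while the real function builds the CodeTemplate directly instead of returning a tag.

// Hypothetical summary of the decision cascade above, for illustration only.
#include <cstdio>

enum class Strategy {
  RepeatParallel,   // no self-aliasing: just repeat the instruction
  RepeatSerial,     // implicit aliasing (e.g. EFLAGS): repeat and accept it
  StaticRenaming,   // tied variables: enumerate exclusive register sets
  RandomizeUsesDefs // untied operands: random defs, uses picked to not alias
};

static Strategy pickStrategy(bool SelfAliasingEmpty, bool HasImplicitAliasing,
                             bool HasTiedVariables) {
  if (SelfAliasingEmpty)
    return Strategy::RepeatParallel;
  if (HasImplicitAliasing)
    return Strategy::RepeatSerial;
  if (HasTiedVariables)
    return Strategy::StaticRenaming;
  return Strategy::RandomizeUsesDefs;
}

int main() {
  // An ADC16rr-like case: the instruction aliases itself through EFLAGS.
  const Strategy S = pickStrategy(/*SelfAliasingEmpty=*/false,
                                  /*HasImplicitAliasing=*/true,
                                  /*HasTiedVariables=*/false);
  std::printf("chosen strategy: %d\n", static_cast<int>(S));
  return 0;
}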
llvm::Expected<std::vector<BenchmarkMeasure>>
UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
  std::vector<BenchmarkMeasure> Result;
  const PfmCountersInfo &PCI = State.getPfmCounters();
  // Uops per port.
  for (const auto *IssueCounter = PCI.IssueCounters,
                  *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
       IssueCounter != IssueCounterEnd; ++IssueCounter) {
    if (!IssueCounter->Counter)
      continue;
    auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
                                              *ExpectedCounterValue));
  }
  // NumMicroOps.
  if (const char *const UopsCounter = PCI.UopsCounter) {
    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
    if (!ExpectedCounterValue)
      return ExpectedCounterValue.takeError();
    Result.push_back(
        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
  }
  return std::move(Result);
}

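runMeasurements runs the snippet once per configured pfm counter, skips ports without a counter, and emits one measurement per issue port plus an overall NumMicroOps value. The toy sketch below only mimics the shape of that loop and its output; the CounterSpec struct, FakeExecutor, the counter strings and the use of std::optional in place of llvm::Expected are all inventions for the illustration.

// Toy stand-in for the measurement loop above; not the tool's API.
#include <cstdio>
#include <optional>
#include <string>
#include <vector>

struct CounterSpec {
  const char *CounterName; // pfm counter, may be null if unsupported
  const char *ProcResName; // scheduler resource it maps to
};

struct Measure {
  std::string Key;
  double Value;
};

struct FakeExecutor {
  // Pretend every supported counter reads back 4.0 uops per iteration.
  std::optional<double> runAndMeasure(const char *Counter) const {
    return Counter ? std::optional<double>(4.0) : std::nullopt;
  }
};

int main() {
  const std::vector<CounterSpec> IssueCounters = {
      {"PORT_0_EVENT", "HWPort0"},
      {nullptr, "HWPort4"}, // no counter available: skipped, like the real loop
  };
  const FakeExecutor Executor;
  std::vector<Measure> Result;
  for (const CounterSpec &C : IssueCounters) {
    if (!C.CounterName)
      continue;
    if (auto Value = Executor.runAndMeasure(C.CounterName))
      Result.push_back({C.ProcResName, *Value});
  }
  if (auto Total = Executor.runAndMeasure("UOPS_TOTAL_EVENT"))
    Result.push_back({"NumMicroOps", *Total});
  for (const Measure &M : Result)
    std::printf("%s = %.1f\n", M.Key.c_str(), M.Value);
  return 0;
}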
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;

} // namespace exegesis
} // namespace llvm