147
|
1 //===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
|
|
2 //
|
|
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
4 // See https://llvm.org/LICENSE.txt for license information.
|
|
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
6 //
|
|
7 //===----------------------------------------------------------------------===//
|
|
8
|
|
9 #include "Uops.h"
|
|
10
|
|
11 #include "Assembler.h"
|
|
12 #include "BenchmarkRunner.h"
|
|
13 #include "MCInstrDescView.h"
|
|
14 #include "Target.h"
|
|
15
|
|
// FIXME: Load constants into registers (e.g. with fld1) so that we do not
// break instructions such as the x87 ones.
|
|
18
|
|
19 // Ideally we would like the only limitation on executing uops to be the issue
|
|
20 // ports. Maximizing port pressure increases the likelihood that the load is
|
|
21 // distributed evenly across possible ports.
|
|
22
|
|
23 // To achieve that, one approach is to generate instructions that do not have
|
|
24 // data dependencies between them.
|
|
25 //
|
|
26 // For some instructions, this is trivial:
|
|
27 // mov rax, qword ptr [rsi]
|
|
28 // mov rax, qword ptr [rsi]
|
|
29 // mov rax, qword ptr [rsi]
|
|
30 // mov rax, qword ptr [rsi]
|
|
31 // For the above snippet, haswell just renames rax four times and executes the
|
|
32 // four instructions two at a time on P23 and P0126.
|
|
33 //
|
|
34 // For some instructions, we just need to make sure that the source is
|
|
35 // different from the destination. For example, IDIV8r reads from GPR and
|
|
36 // writes to AX. We just need to ensure that the Var is assigned a
|
|
37 // register which is different from AX:
|
|
38 // idiv bx
|
|
39 // idiv bx
|
|
40 // idiv bx
|
|
41 // idiv bx
|
|
42 // The above snippet will be able to fully saturate the ports, while the same
|
|
43 // with ax would issue one uop every `latency(IDIV8r)` cycles.
|
|
44 //
|
|
45 // Some instructions make this harder because they both read and write from
|
|
46 // the same register:
|
|
47 // inc rax
|
|
48 // inc rax
|
|
49 // inc rax
|
|
50 // inc rax
|
|
// This has a data dependency from each instruction to the next, limiting the
|
|
52 // number of instructions that can be issued in parallel.
|
|
53 // It turns out that this is not a big issue on recent Intel CPUs because they
|
|
54 // have heuristics to balance port pressure. In the snippet above, subsequent
|
|
55 // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
|
|
56 // might end up executing them all on P0 (just because they can), or try
|
|
57 // avoiding P5 because it's usually under high pressure from vector
|
|
58 // instructions.
|
|
59 // This issue is even more important for high-latency instructions because
|
|
60 // they increase the idle time of the CPU, e.g. :
|
|
61 // imul rax, rbx
|
|
62 // imul rax, rbx
|
|
63 // imul rax, rbx
|
|
64 // imul rax, rbx
|
|
65 //
|
|
66 // To avoid that, we do the renaming statically by generating as many
|
|
67 // independent exclusive assignments as possible (until all possible registers
|
|
68 // are exhausted) e.g.:
|
|
69 // imul rax, rbx
|
|
70 // imul rcx, rbx
|
|
71 // imul rdx, rbx
|
|
72 // imul r8, rbx
|
|
73 //
|
|
// Some instructions even make the above static renaming impossible because
// they implicitly read and write the same operand, e.g. ADC16rr reads
// and writes EFLAGS.
|
|
77 // In that case we just use a greedy register assignment and hope for the
|
|
78 // best.
|
|
79
|
|
80 namespace llvm {
|
|
81 namespace exegesis {
|
|
82
|
|
83 static llvm::SmallVector<const Variable *, 8>
|
|
84 getVariablesWithTiedOperands(const Instruction &Instr) {
|
|
85 llvm::SmallVector<const Variable *, 8> Result;
|
|
86 for (const auto &Var : Instr.Variables)
|
|
87 if (Var.hasTiedOperands())
|
|
88 Result.push_back(&Var);
|
|
89 return Result;
|
|
90 }
|
|
91
|
|
92 static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
|
|
93 assert(a.size() == b.size());
|
|
94 for (auto I : b.set_bits())
|
|
95 a.reset(I);
|
|
96 }
|
|
97
|
|
// Out-of-line defaulted destructor — presumably anchors the class's vtable to
// this translation unit (common LLVM practice); TODO confirm against Uops.h.
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
|
|
99
|
|
// Out-of-line defaulted destructor — presumably anchors the class's vtable to
// this translation unit (common LLVM practice); TODO confirm against Uops.h.
UopsSnippetGenerator::~UopsSnippetGenerator() = default;
|
|
101
|
|
// Assigns each memory operand in `Instructions` a distinct address inside the
// benchmark's scratch space (stride = the target's maximum memory access
// size), then duplicates instructions until at least kMinNumDifferentAddresses
// distinct addresses are in use — presumably to spread accesses and avoid
// address reuse effects between iterations; see kMinNumDifferentAddresses in
// the header (TODO confirm rationale).
// `ScratchSpacePointerInReg` is the register holding the scratch-space base;
// 0 means the snippet has no memory operands and nothing is done.
void UopsSnippetGenerator::instantiateMemoryOperands(
    const unsigned ScratchSpacePointerInReg,
    std::vector<InstructionTemplate> &Instructions) const {
  if (ScratchSpacePointerInReg == 0)
    return; // no memory operands.
  const auto &ET = State.getExegesisTarget();
  const unsigned MemStep = ET.getMaxMemoryAccessSize();
  const size_t OriginalInstructionsSize = Instructions.size();
  // Give every existing instruction its own offset: I * MemStep.
  size_t I = 0;
  for (InstructionTemplate &IT : Instructions) {
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
  }

  // Not enough distinct addresses yet: clone existing instructions
  // round-robin, each clone getting the next fresh offset.
  while (Instructions.size() < kMinNumDifferentAddresses) {
    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
    ++I;
    Instructions.push_back(std::move(IT));
  }
  // The highest offset handed out must still fit in the scratch buffer.
  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
         "not enough scratch space");
}
|
|
125
|
|
// Generates as many copies of `IT` as possible such that no two copies (and no
// two tied variables within a copy) use the same register — the "static
// renaming" described in the file header. Each tied variable in each copy gets
// a register drawn from the registers its operand can encode, excluding any
// register aliasing the scratch-space pointer (if `ScratchSpaceAliasedRegs` is
// non-null). Generation stops as soon as any variable runs out of candidates;
// the copies produced so far are returned.
static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
    const LLVMState &State, const InstructionTemplate &IT,
    const ArrayRef<const Variable *> TiedVariables,
    const BitVector *ScratchSpaceAliasedRegs) {
  std::vector<InstructionTemplate> Instructions;
  // Assign registers to variables in a round-robin manner. This is simple but
  // ensures that the most register-constrained variable does not get starved.
  std::vector<BitVector> PossibleRegsForVar;
  for (const Variable *Var : TiedVariables) {
    assert(Var);
    const Operand &Op = IT.Instr.getPrimaryOperand(*Var);
    assert(Op.isReg());
    // Build the candidate set: start from an empty set, mark the
    // scratch-aliasing registers, flip to get "everything except scratch",
    // then intersect with the registers this operand can actually encode.
    BitVector PossibleRegs = State.getRATC().emptyRegisters();
    if (ScratchSpaceAliasedRegs) {
      PossibleRegs |= *ScratchSpaceAliasedRegs;
    }
    PossibleRegs.flip();
    PossibleRegs &= Op.getRegisterAliasing().sourceBits();
    PossibleRegsForVar.push_back(std::move(PossibleRegs));
  }
  // Iterators[VarId] is the last register assigned to variable VarId; the next
  // search resumes strictly after it.
  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
  while (true) {
    InstructionTemplate TmpIT = IT;
    // Find a possible register for each variable in turn, marking the
    // register as taken.
    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
      const int NextPossibleReg =
          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
      // find_next() returns -1 when no set bit remains, and register 0 is not
      // a valid register, so <= 0 means this variable is out of candidates.
      if (NextPossibleReg <= 0) {
        return Instructions;
      }
      TmpIT.getValueFor(*TiedVariables[VarId]) =
          llvm::MCOperand::createReg(NextPossibleReg);
      // Bump iterator.
      Iterators[VarId] = NextPossibleReg;
      // Prevent other variables from using the register.
      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
        OtherPossibleRegs.reset(NextPossibleReg);
      }
    }
    Instructions.push_back(std::move(TmpIT));
  }
}
|
|
169
|
|
// Builds the code template(s) used to measure uops for `Instr`, choosing a
// strategy based on how the instruction aliases with itself (see the strategy
// discussion in the file header):
//   - no self-aliasing: the instruction is inherently parallel, repeat it;
//   - implicit self-aliasing (e.g. EFLAGS): serial, nothing better to do than
//     repeat it;
//   - tied variables: statically rename registers across copies;
//   - otherwise: pick random non-aliasing defs and uses for a single copy.
// Returns an error when the instruction needs memory but the target provides
// no scratch memory register, or when it clobbers that register.
llvm::Expected<std::vector<CodeTemplate>>
UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
  CodeTemplate CT;
  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
  // Memory instructions need a register pointing at the scratch space; record
  // which registers alias it so they are never picked as operands below.
  if (Instr.hasMemoryOperands()) {
    const auto &ET = State.getExegesisTarget();
    CT.ScratchSpacePointerInReg =
        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
    if (CT.ScratchSpacePointerInReg == 0)
      return llvm::make_error<BenchmarkFailure>(
          "Infeasible : target does not support memory instructions");
    ScratchSpaceAliasedRegs =
        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
    // If the instruction implicitly writes to ScratchSpacePointerInReg , abort.
    // FIXME: We could make a copy of the scratch register.
    for (const auto &Op : Instr.Operands) {
      if (Op.isDef() && Op.isImplicitReg() &&
          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
        return llvm::make_error<BenchmarkFailure>(
            "Infeasible : memory instruction uses scratch memory register");
    }
  }

  const AliasingConfigurations SelfAliasing(Instr, Instr);
  InstructionTemplate IT(Instr);
  // No self-aliasing: copies are independent, any operand assignment works.
  if (SelfAliasing.empty()) {
    CT.Info = "instruction is parallel, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  // Implicit aliasing (e.g. through EFLAGS) cannot be renamed away; repeat
  // and hope the CPU's own renaming/port balancing does well.
  if (SelfAliasing.hasImplicitAliasing()) {
    CT.Info = "instruction is serial, repeating a random one.";
    CT.Instructions.push_back(std::move(IT));
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  // Tied operands: break the chain by statically assigning each copy its own
  // registers (see generateSnippetUsingStaticRenaming).
  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
  if (!TiedVariables.empty()) {
    CT.Info = "instruction has tied variables, using static renaming.";
    CT.Instructions = generateSnippetUsingStaticRenaming(
        State, IT, TiedVariables, ScratchSpaceAliasedRegs);
    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
    return getSingleton(std::move(CT));
  }
  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
  // No tied variables, we pick random values for defs.
  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
      // Candidates: registers this operand can encode, minus reserved ones
      // and minus anything aliasing the scratch pointer.
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      Defs.set(RandomReg);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  // And pick random use values that are not reserved and don't alias with defs.
  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
  for (const auto &Op : Instr.Operands) {
    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
      remove(PossibleRegisters, ReservedRegisters);
      // Do not use the scratch memory address register.
      if (ScratchSpaceAliasedRegs)
        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
      remove(PossibleRegisters, DefAliases);
      assert(PossibleRegisters.any() && "No register left to choose from");
      const auto RandomReg = randomBit(PossibleRegisters);
      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
    }
  }
  CT.Info =
      "instruction has no tied variables picking Uses different from defs";
  CT.Instructions.push_back(std::move(IT));
  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
  return getSingleton(std::move(CT));
}
|
|
252
|
|
253 llvm::Expected<std::vector<BenchmarkMeasure>>
|
|
254 UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
|
|
255 std::vector<BenchmarkMeasure> Result;
|
|
256 const PfmCountersInfo &PCI = State.getPfmCounters();
|
|
257 // Uops per port.
|
|
258 for (const auto *IssueCounter = PCI.IssueCounters,
|
|
259 *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
|
|
260 IssueCounter != IssueCounterEnd; ++IssueCounter) {
|
|
261 if (!IssueCounter->Counter)
|
|
262 continue;
|
|
263 auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
|
|
264 if (!ExpectedCounterValue)
|
|
265 return ExpectedCounterValue.takeError();
|
|
266 Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
|
|
267 *ExpectedCounterValue));
|
|
268 }
|
|
269 // NumMicroOps.
|
|
270 if (const char *const UopsCounter = PCI.UopsCounter) {
|
|
271 auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
|
|
272 if (!ExpectedCounterValue)
|
|
273 return ExpectedCounterValue.takeError();
|
|
274 Result.push_back(
|
|
275 BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
|
|
276 }
|
|
277 return std::move(Result);
|
|
278 }
|
|
279
|
|
// Out-of-line definition of the static constexpr member, needed when the
// member is odr-used (pre-C++17 ODR rules).
constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
|
|
281
|
|
282 } // namespace exegesis
|
|
283 } // namespace llvm
|