diff tools/llvm-exegesis/lib/Uops.cpp @ 147:c2174574ed3a
LLVM 10
author: Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date:   Wed, 14 Aug 2019 16:55:33 +0900
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/llvm-exegesis/lib/Uops.cpp	Wed Aug 14 16:55:33 2019 +0900
@@ -0,0 +1,283 @@
+//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Uops.h"
+
+#include "Assembler.h"
+#include "BenchmarkRunner.h"
+#include "MCInstrDescView.h"
+#include "Target.h"
+
+// FIXME: Load constants into registers (e.g. with fld1) to not break
+// instructions like x87.
+
+// Ideally we would like the only limitation on executing uops to be the issue
+// ports. Maximizing port pressure increases the likelihood that the load is
+// distributed evenly across possible ports.
+
+// To achieve that, one approach is to generate instructions that do not have
+// data dependencies between them.
+//
+// For some instructions, this is trivial:
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+// For the above snippet, Haswell just renames rax four times and executes the
+// four instructions two at a time on P23 and P0126.
+//
+// For some instructions, we just need to make sure that the source is
+// different from the destination. For example, IDIV8r reads from GPR and
+// writes to AX. We just need to ensure that the Var is assigned a
+// register which is different from AX:
+//    idiv bx
+//    idiv bx
+//    idiv bx
+//    idiv bx
+// The above snippet will be able to fully saturate the ports, while the same
+// with ax would issue one uop every `latency(IDIV8r)` cycles.
+//
+// Some instructions make this harder because they both read and write from
+// the same register:
+//    inc rax
+//    inc rax
+//    inc rax
+//    inc rax
+// This has a data dependency from each instruction to the next, limiting the
+// number of instructions that can be issued in parallel.
+// It turns out that this is not a big issue on recent Intel CPUs because they
+// have heuristics to balance port pressure. In the snippet above, subsequent
+// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
+// might end up executing them all on P0 (just because they can), or try
+// avoiding P5 because it's usually under high pressure from vector
+// instructions.
+// This issue is even more important for high-latency instructions because
+// they increase the idle time of the CPU, e.g.:
+//    imul rax, rbx
+//    imul rax, rbx
+//    imul rax, rbx
+//    imul rax, rbx
+//
+// To avoid that, we do the renaming statically by generating as many
+// independent exclusive assignments as possible (until all possible registers
+// are exhausted), e.g.:
+//    imul rax, rbx
+//    imul rcx, rbx
+//    imul rdx, rbx
+//    imul r8,  rbx
+//
+// Some instructions even make the above static renaming impossible because
+// they implicitly read and write from the same operand, e.g. ADC16rr reads
+// from and writes to EFLAGS.
+// In that case we just use a greedy register assignment and hope for the
+// best.
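
To see why the static renaming pays off, a back-of-the-envelope model helps. The following is a minimal standalone sketch with illustrative latency and port numbers (not measured values): a chain of N dependent instructions retires one result every `latency` cycles, while N independent copies on a pipelined unit are limited only by the issue ports.

// Illustrative throughput model for the serial vs. renamed snippets above.
// Latency and port count are example values, not measurements.
#include <cstdio>

int main() {
  const unsigned N = 4;       // repeated instructions in the snippet
  const unsigned Latency = 3; // an imul-like latency (illustrative)
  const unsigned Ports = 1;   // ports that can execute the instruction

  // Dependent chain: every copy waits for the previous result.
  const unsigned SerialCycles = N * Latency;
  // Independent copies: fill the latency once, then one issue per cycle
  // per port.
  const unsigned RenamedCycles = Latency + (N - 1 + Ports - 1) / Ports;

  std::printf("serial: %u cycles, renamed: %u cycles\n", SerialCycles,
              RenamedCycles); // prints "serial: 12 cycles, renamed: 6 cycles"
}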
+
+namespace llvm {
+namespace exegesis {
+
+static llvm::SmallVector<const Variable *, 8>
+getVariablesWithTiedOperands(const Instruction &Instr) {
+  llvm::SmallVector<const Variable *, 8> Result;
+  for (const auto &Var : Instr.Variables)
+    if (Var.hasTiedOperands())
+      Result.push_back(&Var);
+  return Result;
+}
+
+static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
+  assert(a.size() == b.size());
+  for (auto I : b.set_bits())
+    a.reset(I);
+}
+
+UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
+
+UopsSnippetGenerator::~UopsSnippetGenerator() = default;
+
+void UopsSnippetGenerator::instantiateMemoryOperands(
+    const unsigned ScratchSpacePointerInReg,
+    std::vector<InstructionTemplate> &Instructions) const {
+  if (ScratchSpacePointerInReg == 0)
+    return; // no memory operands.
+  const auto &ET = State.getExegesisTarget();
+  const unsigned MemStep = ET.getMaxMemoryAccessSize();
+  const size_t OriginalInstructionsSize = Instructions.size();
+  size_t I = 0;
+  for (InstructionTemplate &IT : Instructions) {
+    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
+    ++I;
+  }
+
+  while (Instructions.size() < kMinNumDifferentAddresses) {
+    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
+    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
+    ++I;
+    Instructions.push_back(std::move(IT));
+  }
+  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
+         "not enough scratch space");
+}
+
+static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
+    const LLVMState &State, const InstructionTemplate &IT,
+    const ArrayRef<const Variable *> TiedVariables,
+    const BitVector *ScratchSpaceAliasedRegs) {
+  std::vector<InstructionTemplate> Instructions;
+  // Assign registers to variables in a round-robin manner. This is simple but
+  // ensures that the most register-constrained variable does not get starved.
+  std::vector<BitVector> PossibleRegsForVar;
+  for (const Variable *Var : TiedVariables) {
+    assert(Var);
+    const Operand &Op = IT.Instr.getPrimaryOperand(*Var);
+    assert(Op.isReg());
+    BitVector PossibleRegs = State.getRATC().emptyRegisters();
+    if (ScratchSpaceAliasedRegs) {
+      PossibleRegs |= *ScratchSpaceAliasedRegs;
+    }
+    PossibleRegs.flip();
+    PossibleRegs &= Op.getRegisterAliasing().sourceBits();
+    PossibleRegsForVar.push_back(std::move(PossibleRegs));
+  }
+  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
+  while (true) {
+    InstructionTemplate TmpIT = IT;
+    // Find a possible register for each variable in turn, marking the
+    // register as taken.
+    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
+      const int NextPossibleReg =
+          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
+      if (NextPossibleReg <= 0) {
+        return Instructions;
+      }
+      TmpIT.getValueFor(*TiedVariables[VarId]) =
+          llvm::MCOperand::createReg(NextPossibleReg);
+      // Bump iterator.
+      Iterators[VarId] = NextPossibleReg;
+      // Prevent other variables from using the register.
+      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
+        OtherPossibleRegs.reset(NextPossibleReg);
+      }
+    }
+    Instructions.push_back(std::move(TmpIT));
+  }
+}
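
The round-robin loop in generateSnippetUsingStaticRenaming above can be mirrored with plain containers. A minimal sketch, using std::vector<bool> in place of llvm::BitVector (RegSet and assignRoundRobin are hypothetical names): each tied variable keeps its own candidate set, and every picked register is erased from all sets, so the generated instructions receive mutually exclusive registers.

// Minimal sketch of the round-robin assignment loop, with plain containers.
#include <cstddef>
#include <vector>

using RegSet = std::vector<bool>; // index = register id, true = available

std::vector<std::vector<int>>
assignRoundRobin(std::vector<RegSet> PossibleRegsForVar) {
  const std::size_t NumVars = PossibleRegsForVar.size();
  std::vector<std::size_t> Iterators(NumVars, 0);
  std::vector<std::vector<int>> Assignments;
  while (true) {
    std::vector<int> Regs(NumVars, -1);
    for (std::size_t VarId = 0; VarId < NumVars; ++VarId) {
      // Advance to the next still-available register for this variable.
      std::size_t R = Iterators[VarId];
      const RegSet &Candidates = PossibleRegsForVar[VarId];
      while (R < Candidates.size() && !Candidates[R])
        ++R;
      if (R >= Candidates.size())
        return Assignments; // pool exhausted: stop generating instructions.
      Regs[VarId] = static_cast<int>(R);
      Iterators[VarId] = R;
      // Prevent every variable (including this one) from reusing it.
      for (RegSet &Other : PossibleRegsForVar)
        Other[R] = false;
    }
    Assignments.push_back(std::move(Regs));
  }
}

For a single variable with candidates {1, 2, 3} this returns [1], [2], [3] and then stops: one instruction per free register, mirroring the imul example in the header comment.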
+
+llvm::Expected<std::vector<CodeTemplate>>
+UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
+  CodeTemplate CT;
+  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
+  if (Instr.hasMemoryOperands()) {
+    const auto &ET = State.getExegesisTarget();
+    CT.ScratchSpacePointerInReg =
+        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
+    if (CT.ScratchSpacePointerInReg == 0)
+      return llvm::make_error<BenchmarkFailure>(
+          "Infeasible : target does not support memory instructions");
+    ScratchSpaceAliasedRegs =
+        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
+    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
+    // FIXME: We could make a copy of the scratch register.
+    for (const auto &Op : Instr.Operands) {
+      if (Op.isDef() && Op.isImplicitReg() &&
+          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
+        return llvm::make_error<BenchmarkFailure>(
+            "Infeasible : memory instruction uses scratch memory register");
+    }
+  }
+
+  const AliasingConfigurations SelfAliasing(Instr, Instr);
+  InstructionTemplate IT(Instr);
+  if (SelfAliasing.empty()) {
+    CT.Info = "instruction is parallel, repeating a random one.";
+    CT.Instructions.push_back(std::move(IT));
+    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+    return getSingleton(std::move(CT));
+  }
+  if (SelfAliasing.hasImplicitAliasing()) {
+    CT.Info = "instruction is serial, repeating a random one.";
+    CT.Instructions.push_back(std::move(IT));
+    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+    return getSingleton(std::move(CT));
+  }
+  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
+  if (!TiedVariables.empty()) {
+    CT.Info = "instruction has tied variables, using static renaming.";
+    CT.Instructions = generateSnippetUsingStaticRenaming(
+        State, IT, TiedVariables, ScratchSpaceAliasedRegs);
+    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+    return getSingleton(std::move(CT));
+  }
+  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
+  // No tied variables; we pick random values for defs.
+  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
+  for (const auto &Op : Instr.Operands) {
+    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
+      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
+      remove(PossibleRegisters, ReservedRegisters);
+      // Do not use the scratch memory address register.
+      if (ScratchSpaceAliasedRegs)
+        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
+      assert(PossibleRegisters.any() && "No register left to choose from");
+      const auto RandomReg = randomBit(PossibleRegisters);
+      Defs.set(RandomReg);
+      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
+    }
+  }
+  // And pick random use values that are not reserved and don't alias with
+  // defs.
+  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
+  for (const auto &Op : Instr.Operands) {
+    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
+      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
+      remove(PossibleRegisters, ReservedRegisters);
+      // Do not use the scratch memory address register.
+      if (ScratchSpaceAliasedRegs)
+        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
+      remove(PossibleRegisters, DefAliases);
+      assert(PossibleRegisters.any() && "No register left to choose from");
+      const auto RandomReg = randomBit(PossibleRegisters);
+      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
+    }
+  }
+  CT.Info =
+      "instruction has no tied variables picking Uses different from defs";
+  CT.Instructions.push_back(std::move(IT));
+  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+  return getSingleton(std::move(CT));
+}
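
The untied fallback at the end of generateCodeTemplates reduces to simple set arithmetic: a def gets a random register from its class minus reserved and scratch registers, and each use additionally excludes everything aliasing a def, so repeated copies of the snippet stay independent. A minimal standalone sketch, assuming hypothetical flat register sets in place of llvm::BitVector, with aliasing reduced to identity (erasing the def itself) for brevity:

// Sketch of the def/use picking path; register ids, classes and the RNG
// seed are illustrative.
#include <cassert>
#include <iterator>
#include <random>
#include <set>

using RegSet = std::set<int>;

static RegSet difference(RegSet A, const RegSet &B) {
  for (int R : B)
    A.erase(R);
  return A;
}

static int pickRandomBit(const RegSet &S, std::mt19937 &Rng) {
  assert(!S.empty() && "No register left to choose from");
  std::uniform_int_distribution<std::size_t> Dist(0, S.size() - 1);
  return *std::next(S.begin(), Dist(Rng));
}

int main() {
  std::mt19937 Rng(0);
  const RegSet Reserved = {0};             // e.g. the stack pointer
  const RegSet DefClass = {1, 2, 3, 4};    // registers allowed for the def
  const RegSet UseClass = {1, 2, 3, 4, 5}; // registers allowed for the use

  // Def: random register, never reserved.
  const int Def = pickRandomBit(difference(DefClass, Reserved), Rng);
  // Use: additionally exclude the def (stand-in for all its aliases), so
  // there is no read-after-write dependency between repeated copies.
  RegSet UseCandidates = difference(UseClass, Reserved);
  UseCandidates.erase(Def);
  const int Use = pickRandomBit(UseCandidates, Rng);
  (void)Use;
}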
+
+llvm::Expected<std::vector<BenchmarkMeasure>>
+UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
+  std::vector<BenchmarkMeasure> Result;
+  const PfmCountersInfo &PCI = State.getPfmCounters();
+  // Uops per port.
+  for (const auto *IssueCounter = PCI.IssueCounters,
+                  *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
+       IssueCounter != IssueCounterEnd; ++IssueCounter) {
+    if (!IssueCounter->Counter)
+      continue;
+    auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
+                                              *ExpectedCounterValue));
+  }
+  // NumMicroOps.
+  if (const char *const UopsCounter = PCI.UopsCounter) {
+    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(
+        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
+  }
+  return std::move(Result);
+}
+
+constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
+
+} // namespace exegesis
+} // namespace llvm
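
The measurement side of runMeasurements has a simple shape regardless of the pfm plumbing: one measure per available issue-port counter, plus an optional total-uops counter, with the first failure aborting the run. A sketch with hypothetical stand-in types (Measure, CounterInfo, Executor) replacing the exegesis ones, where an empty optional models the llvm::Expected error path:

// Sketch of runMeasurements' control flow; all types are stand-ins.
#include <optional>
#include <string>
#include <vector>

struct Measure {
  std::string Key; // processor resource name or "NumMicroOps"
  double Value;    // counter value for the run
};

struct CounterInfo {
  const char *Counter;     // pfm event name; null if unsupported on this CPU
  std::string ProcResName; // processor resource the counter maps to
};

struct Executor {
  // Stand-in for FunctionExecutor::runAndMeasure; returns a dummy value.
  std::optional<double> runAndMeasure(const char *) const { return 1.0; }
};

std::optional<std::vector<Measure>>
runMeasurements(const Executor &Exec, const std::vector<CounterInfo> &Issue,
                const char *UopsCounter) {
  std::vector<Measure> Result;
  for (const CounterInfo &CI : Issue) {
    if (!CI.Counter)
      continue; // no counter for this port: skip rather than fail.
    auto V = Exec.runAndMeasure(CI.Counter);
    if (!V)
      return std::nullopt; // propagate the failure, like takeError().
    Result.push_back({CI.ProcResName, *V});
  }
  if (UopsCounter) { // total micro-ops, when the target defines a counter.
    auto V = Exec.runAndMeasure(UopsCounter);
    if (!V)
      return std::nullopt;
    Result.push_back({"NumMicroOps", *V});
  }
  return Result;
}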