diff tools/llvm-exegesis/lib/Uops.cpp @ 147:c2174574ed3a

LLVM 10
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Wed, 14 Aug 2019 16:55:33 +0900
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/llvm-exegesis/lib/Uops.cpp	Wed Aug 14 16:55:33 2019 +0900
@@ -0,0 +1,283 @@
+//===-- Uops.cpp ------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Uops.h"
+
+#include "Assembler.h"
+#include "BenchmarkRunner.h"
+#include "MCInstrDescView.h"
+#include "Target.h"
+
+// FIXME: Load constants into registers (e.g. with fld1) so that we do not
+// break instructions such as the x87 ones.
+
+// Ideally we would like the only limitation on executing uops to be the issue
+// ports. Maximizing port pressure increases the likelihood that the load is
+// distributed evenly across possible ports.
+
+// To achieve that, one approach is to generate instructions that do not have
+// data dependencies between them.
+//
+// For some instructions, this is trivial:
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+// For the above snippet, Haswell just renames rax four times and executes the
+// four instructions two at a time on P23 and P0126.
+//
+// For some instructions, we just need to make sure that the source is
+// different from the destination. For example, IDIV8r reads from a GPR and
+// writes to AX, so we ensure that the Var is assigned a register which is
+// different from AX:
+//    idiv bx
+//    idiv bx
+//    idiv bx
+//    idiv bx
+// The above snippet will be able to fully saturate the ports, while the same
+// snippet with ax would issue one uop every `latency(IDIV8r)` cycles.
+//
+// Some instructions make this harder because they both read from and write to
+// the same register:
+//    inc rax
+//    inc rax
+//    inc rax
+//    inc rax
+// This has a data dependency from each instruction to the next, limiting the
+// number of instructions that can be issued in parallel.
+// It turns out that this is not a big issue on recent Intel CPUs because they
+// have heuristics to balance port pressure. In the snippet above, subsequent
+// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
+// might end up executing them all on P0 (just because they can), or try to
+// avoid P5 because it's usually under high pressure from vector
+// instructions.
+// This issue is even more important for high-latency instructions because
+// they increase the idle time of the CPU, e.g.:
+//    imul rax, rbx
+//    imul rax, rbx
+//    imul rax, rbx
+//    imul rax, rbx
+//
+// To avoid that, we do the renaming statically by generating as many
+// independent exclusive assignments as possible (until all possible registers
+// are exhausted) e.g.:
+//    imul rax, rbx
+//    imul rcx, rbx
+//    imul rdx, rbx
+//    imul r8,  rbx
+//
+// Some instructions even make the above static renaming impossible because
+// they implicitly read from and write to the same operand, e.g. ADC16rr reads
+// from and writes to EFLAGS.
+// In that case we just use a greedy register assignment and hope for the
+// best.
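+// For example, no matter which GPRs we pick for ADC16rr, every instruction
+// in the snippet still chains through the carry flag:
+//    adc ax, bx
+//    adc cx, dx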
+
+namespace llvm {
+namespace exegesis {
+
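+// Returns the variables of `Instr` that have tied operands, i.e. variables
+// whose def and use must be assigned the same register.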
+static llvm::SmallVector<const Variable *, 8>
+getVariablesWithTiedOperands(const Instruction &Instr) {
+  llvm::SmallVector<const Variable *, 8> Result;
+  for (const auto &Var : Instr.Variables)
+    if (Var.hasTiedOperands())
+      Result.push_back(&Var);
+  return Result;
+}
+
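+// Removes from `a` all the bits that are set in `b` (set difference:
+// a = a & ~b).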
+static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
+  assert(a.size() == b.size());
+  for (auto I : b.set_bits())
+    a.reset(I);
+}
+
+UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
+
+UopsSnippetGenerator::~UopsSnippetGenerator() = default;
+
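+// Gives each memory operand of the snippet its own offset into the scratch
+// space so that the accesses do not alias. If the snippet is shorter than
+// kMinNumDifferentAddresses, it is padded with copies touching fresh
+// addresses, presumably so that repeated executions keep hitting several
+// distinct addresses.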
+void UopsSnippetGenerator::instantiateMemoryOperands(
+    const unsigned ScratchSpacePointerInReg,
+    std::vector<InstructionTemplate> &Instructions) const {
+  if (ScratchSpacePointerInReg == 0)
+    return; // no memory operands.
+  const auto &ET = State.getExegesisTarget();
+  const unsigned MemStep = ET.getMaxMemoryAccessSize();
+  const size_t OriginalInstructionsSize = Instructions.size();
+  size_t I = 0;
+  for (InstructionTemplate &IT : Instructions) {
+    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
+    ++I;
+  }
+
+  while (Instructions.size() < kMinNumDifferentAddresses) {
+    InstructionTemplate IT = Instructions[I % OriginalInstructionsSize];
+    ET.fillMemoryOperands(IT, ScratchSpacePointerInReg, I * MemStep);
+    ++I;
+    Instructions.push_back(std::move(IT));
+  }
+  assert(I * MemStep < BenchmarkRunner::ScratchSpace::kSize &&
+         "not enough scratch space");
+}
+
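+// Implements the "static renaming" scheme described in the header comment:
+// candidate registers are handed out to the tied variables round-robin, and
+// each register is used at most once across all variables and generated
+// instructions. For example (hypothetical candidate sets), a single tied
+// variable that can live in {RAX, RBX, RCX} yields three independent
+// templates, assigned RAX, RBX and RCX respectively; generation stops as
+// soon as any variable runs out of registers.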
+static std::vector<InstructionTemplate> generateSnippetUsingStaticRenaming(
+    const LLVMState &State, const InstructionTemplate &IT,
+    const ArrayRef<const Variable *> TiedVariables,
+    const BitVector *ScratchSpaceAliasedRegs) {
+  std::vector<InstructionTemplate> Instructions;
+  // Assign registers to variables in a round-robin manner. This is simple but
+  // ensures that the most register-constrained variable does not get starved.
+  std::vector<BitVector> PossibleRegsForVar;
+  for (const Variable *Var : TiedVariables) {
+    assert(Var);
+    const Operand &Op = IT.Instr.getPrimaryOperand(*Var);
+    assert(Op.isReg());
+    BitVector PossibleRegs = State.getRATC().emptyRegisters();
+    if (ScratchSpaceAliasedRegs) {
+      PossibleRegs |= *ScratchSpaceAliasedRegs;
+    }
+    PossibleRegs.flip();
+    PossibleRegs &= Op.getRegisterAliasing().sourceBits();
+    PossibleRegsForVar.push_back(std::move(PossibleRegs));
+  }
+  SmallVector<int, 2> Iterators(TiedVariables.size(), 0);
+  while (true) {
+    InstructionTemplate TmpIT = IT;
+    // Find a possible register for each variable in turn, marking the
+    // register as taken.
+    for (size_t VarId = 0; VarId < TiedVariables.size(); ++VarId) {
+      const int NextPossibleReg =
+          PossibleRegsForVar[VarId].find_next(Iterators[VarId]);
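+      // find_next() returns -1 when no set bit remains, and register 0 is
+      // the invalid register, so in both cases we are done.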
+      if (NextPossibleReg <= 0) {
+        return Instructions;
+      }
+      TmpIT.getValueFor(*TiedVariables[VarId]) =
+          llvm::MCOperand::createReg(NextPossibleReg);
+      // Bump iterator.
+      Iterators[VarId] = NextPossibleReg;
+      // Prevent other variables from using the register.
+      for (BitVector &OtherPossibleRegs : PossibleRegsForVar) {
+        OtherPossibleRegs.reset(NextPossibleReg);
+      }
+    }
+    Instructions.push_back(std::move(TmpIT));
+  }
+}
+
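+// Picks a snippet generation strategy for `Instr`:
+//  - no self-aliasing: executions are already independent, repeat as-is;
+//  - implicit self-aliasing (e.g. through EFLAGS): nothing we can do,
+//    repeat as-is and measure the serial behavior;
+//  - tied variables: use static renaming (see above);
+//  - otherwise: pick random defs, then random uses that do not alias them.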
+llvm::Expected<std::vector<CodeTemplate>>
+UopsSnippetGenerator::generateCodeTemplates(const Instruction &Instr) const {
+  CodeTemplate CT;
+  const llvm::BitVector *ScratchSpaceAliasedRegs = nullptr;
+  if (Instr.hasMemoryOperands()) {
+    const auto &ET = State.getExegesisTarget();
+    CT.ScratchSpacePointerInReg =
+        ET.getScratchMemoryRegister(State.getTargetMachine().getTargetTriple());
+    if (CT.ScratchSpacePointerInReg == 0)
+      return llvm::make_error<BenchmarkFailure>(
+          "Infeasible : target does not support memory instructions");
+    ScratchSpaceAliasedRegs =
+        &State.getRATC().getRegister(CT.ScratchSpacePointerInReg).aliasedBits();
+    // If the instruction implicitly writes to ScratchSpacePointerInReg, abort.
+    // FIXME: We could make a copy of the scratch register.
+    for (const auto &Op : Instr.Operands) {
+      if (Op.isDef() && Op.isImplicitReg() &&
+          ScratchSpaceAliasedRegs->test(Op.getImplicitReg()))
+        return llvm::make_error<BenchmarkFailure>(
+            "Infeasible : memory instruction uses scratch memory register");
+    }
+  }
+
+  const AliasingConfigurations SelfAliasing(Instr, Instr);
+  InstructionTemplate IT(Instr);
+  if (SelfAliasing.empty()) {
+    CT.Info = "instruction is parallel, repeating a random one.";
+    CT.Instructions.push_back(std::move(IT));
+    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+    return getSingleton(std::move(CT));
+  }
+  if (SelfAliasing.hasImplicitAliasing()) {
+    CT.Info = "instruction is serial, repeating a random one.";
+    CT.Instructions.push_back(std::move(IT));
+    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+    return getSingleton(std::move(CT));
+  }
+  const auto TiedVariables = getVariablesWithTiedOperands(Instr);
+  if (!TiedVariables.empty()) {
+    CT.Info = "instruction has tied variables, using static renaming.";
+    CT.Instructions = generateSnippetUsingStaticRenaming(
+        State, IT, TiedVariables, ScratchSpaceAliasedRegs);
+    instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+    return getSingleton(std::move(CT));
+  }
+  const auto &ReservedRegisters = State.getRATC().reservedRegisters();
+  // No tied variables; we pick random values for defs.
+  llvm::BitVector Defs(State.getRegInfo().getNumRegs());
+  for (const auto &Op : Instr.Operands) {
+    if (Op.isReg() && Op.isExplicit() && Op.isDef() && !Op.isMemory()) {
+      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
+      remove(PossibleRegisters, ReservedRegisters);
+      // Do not use the scratch memory address register.
+      if (ScratchSpaceAliasedRegs)
+        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
+      assert(PossibleRegisters.any() && "No register left to choose from");
+      const auto RandomReg = randomBit(PossibleRegisters);
+      Defs.set(RandomReg);
+      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
+    }
+  }
+  // And pick random use values that are not reserved and don't alias with defs.
+  const auto DefAliases = getAliasedBits(State.getRegInfo(), Defs);
+  for (const auto &Op : Instr.Operands) {
+    if (Op.isReg() && Op.isExplicit() && Op.isUse() && !Op.isMemory()) {
+      auto PossibleRegisters = Op.getRegisterAliasing().sourceBits();
+      remove(PossibleRegisters, ReservedRegisters);
+      // Do not use the scratch memory address register.
+      if (ScratchSpaceAliasedRegs)
+        remove(PossibleRegisters, *ScratchSpaceAliasedRegs);
+      remove(PossibleRegisters, DefAliases);
+      assert(PossibleRegisters.any() && "No register left to choose from");
+      const auto RandomReg = randomBit(PossibleRegisters);
+      IT.getValueFor(Op) = llvm::MCOperand::createReg(RandomReg);
+    }
+  }
+  CT.Info =
+      "instruction has no tied variables, picking Uses different from Defs";
+  CT.Instructions.push_back(std::move(IT));
+  instantiateMemoryOperands(CT.ScratchSpacePointerInReg, CT.Instructions);
+  return getSingleton(std::move(CT));
+}
+
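+// Measures the number of uops issued on each issue port (one counter per
+// port), plus the total number of micro-ops when the target defines a
+// counter for it.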
+llvm::Expected<std::vector<BenchmarkMeasure>>
+UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const {
+  std::vector<BenchmarkMeasure> Result;
+  const PfmCountersInfo &PCI = State.getPfmCounters();
+  // Uops per port.
+  for (const auto *IssueCounter = PCI.IssueCounters,
+                  *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters;
+       IssueCounter != IssueCounterEnd; ++IssueCounter) {
+    if (!IssueCounter->Counter)
+      continue;
+    auto ExpectedCounterValue = Executor.runAndMeasure(IssueCounter->Counter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName,
+                                              *ExpectedCounterValue));
+  }
+  // NumMicroOps.
+  if (const char *const UopsCounter = PCI.UopsCounter) {
+    auto ExpectedCounterValue = Executor.runAndMeasure(UopsCounter);
+    if (!ExpectedCounterValue)
+      return ExpectedCounterValue.takeError();
+    Result.push_back(
+        BenchmarkMeasure::Create("NumMicroOps", *ExpectedCounterValue));
+  }
+  return std::move(Result);
+}
+
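+// Out-of-line definition of the class constant: pre-C++17, a static
+// constexpr data member that is odr-used still requires one.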
+constexpr const size_t UopsSnippetGenerator::kMinNumDifferentAddresses;
+
+} // namespace exegesis
+} // namespace llvm