diff lld/MachO/SyntheticSections.h @ 236:c4bab56944e8 llvm-original

LLVM 16
author kono
date Wed, 09 Nov 2022 17:45:10 +0900
parents 5f17cb93ff66
children 1f2b6ac9f198
line wrap: on
line diff
--- a/lld/MachO/SyntheticSections.h	Wed Jul 21 10:27:27 2021 +0900
+++ b/lld/MachO/SyntheticSections.h	Wed Nov 09 17:45:10 2022 +0900
@@ -19,8 +19,9 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -30,8 +31,7 @@
 class DWARFUnit;
 } // namespace llvm
 
-namespace lld {
-namespace macho {
+namespace lld::macho {
 
 class Defined;
 class DylibSymbol;
@@ -62,12 +62,14 @@
     align = target->wordSize;
   }
 
+  // Implementations of this method can assume that the regular (non-__LINKEDIT)
+  // sections already have their addresses assigned.
   virtual void finalizeContents() {}
 
   // Sections in __LINKEDIT are special: their offsets are recorded in the
   // load commands like LC_DYLD_INFO_ONLY and LC_SYMTAB, instead of in section
   // headers.
-  bool isHidden() const override final { return true; }
+  bool isHidden() const final { return true; }
 
   virtual uint64_t getRawSize() const = 0;
 
@@ -77,9 +79,7 @@
   //
   // NOTE: This assumes that the extra bytes required for alignment can be
   // zero-valued bytes.
-  uint64_t getSize() const override final {
-    return llvm::alignTo(getRawSize(), align);
-  }
+  uint64_t getSize() const final { return llvm::alignTo(getRawSize(), align); }
 };
 
 // The header of the Mach-O file, which must have a file offset of zero.
@@ -103,6 +103,7 @@
 public:
   PageZeroSection();
   bool isHidden() const override { return true; }
+  bool isNeeded() const override { return target->pageZeroSize != 0; }
   uint64_t getSize() const override { return target->pageZeroSize; }
   uint64_t getFileSize() const override { return 0; }
   void writeTo(uint8_t *buf) const override {}
@@ -189,13 +190,13 @@
   bool isNeeded() const override { return !bindingsMap.empty(); }
   void writeTo(uint8_t *buf) const override;
 
-  void addEntry(const DylibSymbol *dysym, const InputSection *isec,
-                uint64_t offset, int64_t addend = 0) {
+  void addEntry(const Symbol *dysym, const InputSection *isec, uint64_t offset,
+                int64_t addend = 0) {
     bindingsMap[dysym].emplace_back(addend, Location(isec, offset));
   } // each call appends one (addend, Location) pair to dysym's list
 
 private:
-  BindingsMap<const DylibSymbol *> bindingsMap;
+  BindingsMap<const Symbol *> bindingsMap;
   SmallVector<char, 128> contents;
 };
 
@@ -269,6 +270,12 @@
 // order that the weak bindings may overwrite the non-lazy bindings if an
 // appropriate symbol is found at runtime. However, the bound addresses will
 // still be written (non-lazily) into the LazyPointerSection.
+//
+// Symbols are always bound eagerly when chained fixups are used. In that case,
+// StubsSection contains indirect jumps to addresses stored in the GotSection.
+// The GOT directly contains the fixup entries, which will be replaced by the
+// address of the target symbols on load. LazyPointerSection and
+// StubHelperSection are not used.
 
 class StubsSection final : public SyntheticSection {
 public:
@@ -278,9 +285,9 @@
   void finalize() override;
   void writeTo(uint8_t *buf) const override;
   const llvm::SetVector<Symbol *> &getEntries() const { return entries; }
-  // Returns whether the symbol was added. Note that every stubs entry will
-  // have a corresponding entry in the LazyPointerSection.
-  bool addEntry(Symbol *);
+  // Creates a stub for the symbol and the corresponding entry in the
+  // LazyPointerSection.
+  void addEntry(Symbol *);
   uint64_t getVA(uint32_t stubsIndex) const {
     assert(isFinal || target->usesThunks());
     // ConcatOutputSection::finalize() can seek the address of a
@@ -303,12 +310,36 @@
   bool isNeeded() const override;
   void writeTo(uint8_t *buf) const override;
 
-  void setup();
+  void setUp();
 
   DylibSymbol *stubBinder = nullptr;
   Defined *dyldPrivate = nullptr;
 };
 
+// Objective-C stubs are hoisted objc_msgSend calls, one per selector called in
+// the program. Apple Clang emits an undefined symbol for each stub, such as
+// '_objc_msgSend$foo', which is then synthesized by the linker. Each stub
+// loads its selector 'foo' from __objc_selrefs, passes it as the selector
+// (second) argument of objc_msgSend, and then jumps to objc_msgSend. The
+// actual stub contents are mirrored from ld64.
+class ObjCStubsSection final : public SyntheticSection {
+public:
+  ObjCStubsSection();
+  void addEntry(Symbol *sym);
+  uint64_t getSize() const override;
+  bool isNeeded() const override { return !symbols.empty(); } // any stub symbols?
+  void finalize() override { isec->isFinal = true; } // only marks isec as final
+  void writeTo(uint8_t *buf) const override;
+  void setUp();
+
+  static constexpr llvm::StringLiteral symbolPrefix = "_objc_msgSend$";
+
+private:
+  std::vector<Defined *> symbols; // the section is needed iff this is non-empty
+  std::vector<uint32_t> offsets; // presumably one offset per stub; TODO confirm
+  int objcMsgSendGotIndex = 0; // presumably objc_msgSend's GOT slot; TODO confirm
+};
+
 // Note that this section may also be targeted by non-lazy bindings. In
 // particular, this happens when branch relocations target weak symbols.
 class LazyPointerSection final : public SyntheticSection {
@@ -317,6 +348,9 @@
   uint64_t getSize() const override;
   bool isNeeded() const override;
   void writeTo(uint8_t *buf) const override;
+  uint64_t getVA(uint32_t index) const { // widen index: no 32-bit shift wrap
+    return addr + (static_cast<uint64_t>(index) << target->p2WordSize);
+  }
 };
 
 class LazyBindingSection final : public LinkEditSection {
@@ -328,13 +362,13 @@
   void writeTo(uint8_t *buf) const override;
   // Note that every entry here will by referenced by a corresponding entry in
   // the StubHelperSection.
-  void addEntry(DylibSymbol *dysym);
-  const llvm::SetVector<DylibSymbol *> &getEntries() const { return entries; }
+  void addEntry(Symbol *dysym);
+  const llvm::SetVector<Symbol *> &getEntries() const { return entries; }
 
 private:
-  uint32_t encode(const DylibSymbol &);
+  uint32_t encode(const Symbol &);
 
-  llvm::SetVector<DylibSymbol *> entries;
+  llvm::SetVector<Symbol *> entries;
   SmallVector<char, 128> contents;
   llvm::raw_svector_ostream os{contents};
 };
@@ -345,6 +379,7 @@
   ExportSection();
   void finalizeContents() override;
   uint64_t getRawSize() const override { return size; }
+  bool isNeeded() const override { return size; }
   void writeTo(uint8_t *buf) const override;
 
   bool hasWeakSymbol = false;
@@ -354,8 +389,9 @@
   size_t size = 0;
 };
 
-// Stores 'data in code' entries that describe the locations of
-// data regions inside code sections.
+// Stores 'data in code' entries that describe the locations of data regions
+// inside code sections. This is used by llvm-objdump to distinguish jump tables
+// and stop them from being disassembled as instructions.
 class DataInCodeSection final : public LinkEditSection {
 public:
   DataInCodeSection();
@@ -431,7 +467,7 @@
   uint32_t getNumUndefinedSymbols() const { return undefinedSymbols.size(); }
 
 private:
-  void emitBeginSourceStab(llvm::DWARFUnit *compileUnit);
+  void emitBeginSourceStab(StringRef);
   void emitEndSourceStab();
   void emitObjectFileStab(ObjFile *);
   void emitEndFunStab(Defined *);
@@ -476,6 +512,8 @@
 // The code signature comes at the very end of the linked output file.
 class CodeSignatureSection final : public LinkEditSection {
 public:
+  // NOTE: These values are duplicated in llvm-objcopy's MachO/Object.h file,
+  // and any changes here should be repeated there.
   static constexpr uint8_t blockSizeShift = 12;
   static constexpr size_t blockSize = (1 << blockSizeShift); // 4 KiB
   static constexpr size_t hashSize = 256 / 8;
@@ -510,7 +548,7 @@
 
 class CStringSection : public SyntheticSection {
 public:
-  CStringSection();
+  CStringSection(const char *name);
   void addInput(CStringInputSection *);
   uint64_t getSize() const override { return size; }
   virtual void finalizeContents();
@@ -525,13 +563,23 @@
 
 class DeduplicatedCStringSection final : public CStringSection {
 public:
-  DeduplicatedCStringSection();
-  uint64_t getSize() const override { return builder.getSize(); }
+  DeduplicatedCStringSection(const char *name) : CStringSection(name) {}
+  uint64_t getSize() const override { return size; } // the `size` member below
   void finalizeContents() override;
-  void writeTo(uint8_t *buf) const override { builder.write(buf); }
+  void writeTo(uint8_t *buf) const override;
+
+  struct StringOffset {
+    uint8_t trailingZeros; // no default; always set by the constructor
+    uint64_t outSecOff = UINT64_MAX; // presumably "not yet assigned"; TODO confirm
+
+    explicit StringOffset(uint8_t zeros) : trailingZeros(zeros) {}
+  };
+
+  StringOffset getStringOffset(StringRef str) const;
 
 private:
-  llvm::StringTableBuilder builder;
+  llvm::DenseMap<llvm::CachedHashStringRef, StringOffset> stringOffsetMap; // one entry per distinct key
+  size_t size = 0;
 };
 
 /*
@@ -543,7 +591,7 @@
   using UInt128 = std::pair<uint64_t, uint64_t>;
   // I don't think the standard guarantees the size of a pair, so let's make
   // sure it's exact -- that way we can construct it via `mmap`.
-  static_assert(sizeof(UInt128) == 16, "");
+  static_assert(sizeof(UInt128) == 16);
 
   WordLiteralSection();
   void addInput(WordLiteralInputSection *);
@@ -560,16 +608,16 @@
            !literal8Map.empty();
   }
 
-  uint64_t getLiteral16Offset(const uint8_t *buf) const {
+  uint64_t getLiteral16Offset(uintptr_t buf) const {
     return literal16Map.at(*reinterpret_cast<const UInt128 *>(buf)) * 16;
   }
 
-  uint64_t getLiteral8Offset(const uint8_t *buf) const {
+  uint64_t getLiteral8Offset(uintptr_t buf) const {
     return literal16Map.size() * 16 +
            literal8Map.at(*reinterpret_cast<const uint64_t *>(buf)) * 8;
   }
 
-  uint64_t getLiteral4Offset(const uint8_t *buf) const {
+  uint64_t getLiteral4Offset(uintptr_t buf) const {
     return literal16Map.size() * 16 + literal8Map.size() * 8 +
            literal4Map.at(*reinterpret_cast<const uint32_t *>(buf)) * 4;
   }
@@ -588,9 +636,163 @@
   std::unordered_map<uint32_t, uint64_t> literal4Map;
 };
 
+class ObjCImageInfoSection final : public SyntheticSection {
+public:
+  ObjCImageInfoSection();
+  bool isNeeded() const override { return !files.empty(); } // needs >= 1 file
+  uint64_t getSize() const override { return 8; } // size is fixed at 8 bytes
+  void addFile(const InputFile *file) { // file must carry image info (asserted)
+    assert(!file->objCImageInfo.empty());
+    files.push_back(file);
+  }
+  void finalizeContents();
+  void writeTo(uint8_t *buf) const override;
+
+private:
+  struct ImageInfo {
+    uint8_t swiftVersion = 0;
+    bool hasCategoryClassProperties = false;
+  } info; // NOTE(review): presumably the value merged from all files; confirm
+  static ImageInfo parseImageInfo(const InputFile *);
+  std::vector<const InputFile *> files; // files with image info
+};
+
+// This section stores 32-bit __TEXT segment offsets of initializer functions.
+//
+// The compiler stores pointers to initializers in __mod_init_func. These need
+// to be fixed up at load time, which takes time and dirties memory. By
+// synthesizing InitOffsetsSection from them, this data can live in the
+// read-only __TEXT segment instead. This section is used by default when
+// chained fixups are enabled.
+//
+// There is no similar counterpart to __mod_term_func, as that section is
+// deprecated, and static destructors are instead handled by registering them
+// via __cxa_atexit from an autogenerated initializer function (see D121736).
+class InitOffsetsSection final : public SyntheticSection {
+public:
+  InitOffsetsSection();
+  bool isNeeded() const override { return !sections.empty(); } // any inputs?
+  uint64_t getSize() const override;
+  void writeTo(uint8_t *buf) const override;
+  void setUp();
+
+  void addInput(ConcatInputSection *isec) { sections.push_back(isec); }
+  const std::vector<ConcatInputSection *> &inputs() const { return sections; }
+
+private:
+  std::vector<ConcatInputSection *> sections; // inputs, in addInput() call order
+};
+
+// Chained fixups are a replacement for classic dyld opcodes. In this format,
+// most of the metadata necessary for binding symbols and rebasing addresses is
+// stored directly in the memory location that will have the fixup applied.
+//
+// The fixups form singly linked lists, each one covering a single page in
+// memory. The __LINKEDIT,__chainfixups section stores the page offset of the
+// first fixup of each page; the rest can be found by walking the chain using
+// the offset that is embedded in each entry.
+//
+// This setup allows pages to be relocated lazily at page-in time and without
+// being dirtied. The kernel can discard and load them again as needed. This
+// technique, called page-in linking, was introduced in macOS 13.
+//
+// The benefits of this format are:
+//  - smaller __LINKEDIT segment, as most of the fixup information is stored in
+//    the data segment
+//  - faster startup, since not all relocations need to be done upfront
+//  - slightly lower memory usage, as fewer pages are dirtied
+//
+// Userspace x86_64 and arm64 binaries have two types of fixup entries:
+//   - Rebase entries contain an absolute address, to which the object's load
+//     address will be added to get the final value. This is used for loading
+//     the address of a symbol defined in the same binary.
+//   - Binding entries are mostly used for symbols imported from other dylibs,
+//     but for weakly bound and interposable symbols as well. They are looked up
+//     by a (symbol name, library) pair stored in __chainfixups. This import
+//     entry also encodes whether the import is weak (i.e. if the symbol is
+//     missing, it should be set to null instead of producing a load error).
+//     The fixup encodes an ordinal associated with the import, and an optional
+//     addend.
+//
+// The entries are tightly packed 64-bit bitfields. One of the bits specifies
+// which kind of fixup to interpret them as.
+//
+// LLD generates the fixup data in 5 stages:
+//   1. While scanning relocations, we make a note of each location that needs
+//      a fixup by calling addRebase() or addBinding(). During this, we assign
+//      a unique ordinal for each (symbol name, library, addend) import tuple.
+//   2. After addresses have been assigned to all sections, and thus the memory
+//      layout of the linked image is final, finalizeContents() is called. Here,
+//      the page offsets of the chain start entries are calculated.
+//   3. ChainedFixupsSection::writeTo() writes the page start offsets and the
+//      imports table to the output file.
+//   4. Each section's fixup entries are encoded and written to disk in
+//      ConcatInputSection::writeTo(), but without writing the offsets that form
+//      the chain.
+//   5. Finally, each page's (which might correspond to multiple sections)
+//      fixups are linked together in Writer::buildFixupChains().
+class ChainedFixupsSection final : public LinkEditSection {
+public:
+  ChainedFixupsSection();
+  void finalizeContents() override;
+  uint64_t getRawSize() const override { return size; } // the `size` member below
+  bool isNeeded() const override;
+  void writeTo(uint8_t *buf) const override;
+
+  void addRebase(const InputSection *isec, uint64_t offset) {
+    locations.emplace_back(isec, offset); // remember this fixup site
+  }
+  void addBinding(const Symbol *dysym, const InputSection *isec,
+                  uint64_t offset, int64_t addend = 0);
+
+  void setHasNonWeakDefinition() { hasNonWeakDef = true; } // only ever sets the flag
+
+  // Returns an (ordinal, inline addend) tuple used by dyld_chained_ptr_64_bind.
+  std::pair<uint32_t, uint8_t> getBinding(const Symbol *sym,
+                                          int64_t addend) const;
+
+  const std::vector<Location> &getLocations() const { return locations; }
+
+  bool hasWeakBinding() const { return hasWeakBind; }
+  bool hasNonWeakDefinition() const { return hasNonWeakDef; }
+
+private:
+  // Location::offset initially stores the offset within an InputSection, but
+  // contains output segment offsets after finalizeContents().
+  std::vector<Location> locations;
+  // (target symbol, addend) => import ordinal; MapVector keeps insertion order.
+  llvm::MapVector<std::pair<const Symbol *, int64_t>, uint32_t> bindings;
+
+  struct SegmentInfo {
+    SegmentInfo(const OutputSegment *oseg) : oseg(oseg) {}
+
+    const OutputSegment *oseg;
+    // (page index, fixup starts offset)
+    llvm::SmallVector<std::pair<uint16_t, uint16_t>> pageStarts;
+
+    size_t getSize() const;
+    size_t writeTo(uint8_t *buf) const;
+  };
+  llvm::SmallVector<SegmentInfo, 4> fixupSegments;
+
+  size_t symtabSize = 0;
+  size_t size = 0; // reported by getRawSize()
+
+  bool needsAddend = false;
+  bool needsLargeAddend = false;
+  bool hasWeakBind = false; // exposed through hasWeakBinding()
+  bool hasNonWeakDef = false; // exposed through hasNonWeakDefinition()
+  llvm::MachO::ChainedImportFormat importFormat; // NOTE(review): no default initializer
+};
+
+void writeChainedRebase(uint8_t *buf, uint64_t targetVA);
+void writeChainedFixup(uint8_t *buf, const Symbol *sym, int64_t addend);
+
 struct InStruct {
+  const uint8_t *bufferStart = nullptr;
   MachHeaderSection *header = nullptr;
   CStringSection *cStringSection = nullptr;
+  DeduplicatedCStringSection *objcMethnameSection = nullptr;
   WordLiteralSection *wordLiteralSection = nullptr;
   RebaseSection *rebase = nullptr;
   BindingSection *binding = nullptr;
@@ -602,8 +804,13 @@
   LazyPointerSection *lazyPointers = nullptr;
   StubsSection *stubs = nullptr;
   StubHelperSection *stubHelper = nullptr;
+  ObjCStubsSection *objcStubs = nullptr;
+  ConcatInputSection *objcSelrefs = nullptr;
   UnwindInfoSection *unwindInfo = nullptr;
+  ObjCImageInfoSection *objCImageInfo = nullptr;
   ConcatInputSection *imageLoaderCache = nullptr;
+  InitOffsetsSection *initOffsets = nullptr;
+  ChainedFixupsSection *chainedFixups = nullptr;
 };
 
 extern InStruct in;
@@ -611,7 +818,6 @@
 
 void createSyntheticSymbols();
 
-} // namespace macho
-} // namespace lld
+} // namespace lld::macho
 
 #endif