Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 1 | //===- Object.h - Mach-O object file model ----------------------*- C++ -*-===// |
| 2 | // |
Chandler Carruth | 127252b | 2019-02-11 08:25:19 +0000 | [diff] [blame] | 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_OBJCOPY_MACHO_OBJECT_H |
| 10 | #define LLVM_OBJCOPY_MACHO_OBJECT_H |
| 11 | |
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/ObjectYAML/DWARFYAML.h"
#include "llvm/Support/YAMLTraits.h"
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
| 21 | |
| 22 | namespace llvm { |
| 23 | namespace objcopy { |
| 24 | namespace macho { |
| 25 | |
/// Plain-data holder for the fields of a Mach-O file header.
///
/// Every field is zero-initialized so that a default-constructed header never
/// exposes indeterminate values.
struct MachHeader {
  uint32_t Magic = 0;
  uint32_t CPUType = 0;
  uint32_t CPUSubType = 0;
  uint32_t FileType = 0;
  uint32_t NCmds = 0;
  uint32_t SizeOfCmds = 0;
  uint32_t Flags = 0;
  // NOTE(review): presumably only meaningful for 64-bit headers -- confirm.
  uint32_t Reserved = 0;
};
| 36 | |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 37 | struct RelocationInfo; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 38 | struct Section { |
Seiya Nuta | b728e53 | 2019-06-08 01:22:54 +0000 | [diff] [blame] | 39 | std::string Sectname; |
| 40 | std::string Segname; |
Seiya Nuta | 7f19dd1 | 2019-10-28 15:40:37 +0900 | [diff] [blame] | 41 | // CanonicalName is a string formatted as “<Segname>,<Sectname>". |
| 42 | std::string CanonicalName; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 43 | uint64_t Addr; |
| 44 | uint64_t Size; |
| 45 | uint32_t Offset; |
| 46 | uint32_t Align; |
| 47 | uint32_t RelOff; |
| 48 | uint32_t NReloc; |
| 49 | uint32_t Flags; |
| 50 | uint32_t Reserved1; |
| 51 | uint32_t Reserved2; |
| 52 | uint32_t Reserved3; |
| 53 | |
| 54 | StringRef Content; |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 55 | std::vector<RelocationInfo> Relocations; |
Seiya Nuta | b728e53 | 2019-06-08 01:22:54 +0000 | [diff] [blame] | 56 | |
| 57 | MachO::SectionType getType() const { |
| 58 | return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE); |
| 59 | } |
| 60 | |
| 61 | bool isVirtualSection() const { |
| 62 | return (getType() == MachO::S_ZEROFILL || |
| 63 | getType() == MachO::S_GB_ZEROFILL || |
| 64 | getType() == MachO::S_THREAD_LOCAL_ZEROFILL); |
| 65 | } |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 66 | }; |
| 67 | |
| 68 | struct LoadCommand { |
| 69 | // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h |
| 70 | // and it is a union of all the structs corresponding to various load |
| 71 | // commands. |
| 72 | MachO::macho_load_command MachOLoadCommand; |
| 73 | |
| 74 | // The raw content of the payload of the load command (located right after the |
| 75 | // corresponding struct). In some cases it is either empty or can be |
| 76 | // copied-over without digging into its structure. |
| 77 | ArrayRef<uint8_t> Payload; |
| 78 | |
| 79 | // Some load commands can contain (inside the payload) an array of sections, |
| 80 | // though the contents of the sections are stored separately. The struct |
| 81 | // Section describes only sections' metadata and where to find the |
| 82 | // corresponding content inside the binary. |
| 83 | std::vector<Section> Sections; |
| 84 | }; |
| 85 | |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 86 | // A symbol information. Fields which starts with "n_" are same as them in the |
| 87 | // nlist. |
| 88 | struct SymbolEntry { |
| 89 | std::string Name; |
| 90 | uint32_t Index; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 91 | uint8_t n_type; |
| 92 | uint8_t n_sect; |
| 93 | uint16_t n_desc; |
| 94 | uint64_t n_value; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame] | 95 | |
| 96 | bool isExternalSymbol() const { |
| 97 | return n_type & ((MachO::N_EXT | MachO::N_PEXT)); |
| 98 | } |
| 99 | |
| 100 | bool isLocalSymbol() const { return !isExternalSymbol(); } |
| 101 | |
| 102 | bool isUndefinedSymbol() const { |
| 103 | return (n_type & MachO::N_TYPE) == MachO::N_UNDF; |
| 104 | } |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 105 | }; |
| 106 | |
| 107 | /// The location of the symbol table inside the binary is described by LC_SYMTAB |
| 108 | /// load command. |
| 109 | struct SymbolTable { |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 110 | std::vector<std::unique_ptr<SymbolEntry>> Symbols; |
| 111 | |
| 112 | const SymbolEntry *getSymbolByIndex(uint32_t Index) const; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 113 | }; |
| 114 | |
Seiya Nuta | 1e589f6 | 2019-10-30 15:12:17 +0900 | [diff] [blame^] | 115 | struct IndirectSymbolEntry { |
| 116 | // The original value in an indirect symbol table. Higher bits encode extra |
| 117 | // information (INDIRECT_SYMBOL_LOCAL and INDIRECT_SYMBOL_ABS). |
| 118 | uint32_t OriginalIndex; |
| 119 | /// The Symbol referenced by this entry. It's None if the index is |
| 120 | /// INDIRECT_SYMBOL_LOCAL or INDIRECT_SYMBOL_ABS. |
| 121 | Optional<const SymbolEntry *> Symbol; |
| 122 | |
| 123 | IndirectSymbolEntry(uint32_t OriginalIndex, |
| 124 | Optional<const SymbolEntry *> Symbol) |
| 125 | : OriginalIndex(OriginalIndex), Symbol(Symbol) {} |
| 126 | }; |
| 127 | |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame] | 128 | struct IndirectSymbolTable { |
Seiya Nuta | 1e589f6 | 2019-10-30 15:12:17 +0900 | [diff] [blame^] | 129 | std::vector<IndirectSymbolEntry> Symbols; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame] | 130 | }; |
| 131 | |
/// The string table. Its location inside the binary is described by the
/// LC_SYMTAB load command.
struct StringTable {
  // The individual strings of the table.
  std::vector<std::string> Strings;
};
| 137 | |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 138 | struct RelocationInfo { |
| 139 | const SymbolEntry *Symbol; |
| 140 | // True if Info is a scattered_relocation_info. |
| 141 | bool Scattered; |
| 142 | MachO::any_relocation_info Info; |
| 143 | }; |
| 144 | |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 145 | /// The location of the rebase info inside the binary is described by |
| 146 | /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at |
| 147 | /// an address different from its preferred address. The rebase information is |
| 148 | /// a stream of byte sized opcodes whose symbolic names start with |
| 149 | /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: |
| 150 | /// <seg-index, seg-offset, type> |
| 151 | /// The opcodes are a compressed way to encode the table by only |
| 152 | /// encoding when a column changes. In addition simple patterns |
| 153 | /// like "every n'th offset for m times" can be encoded in a few |
| 154 | /// bytes. |
| 155 | struct RebaseInfo { |
| 156 | // At the moment we do not parse this info (and it is simply copied over), |
| 157 | // but the proper support will be added later. |
| 158 | ArrayRef<uint8_t> Opcodes; |
| 159 | }; |
| 160 | |
| 161 | /// The location of the bind info inside the binary is described by |
| 162 | /// LC_DYLD_INFO load command. Dyld binds an image during the loading process, |
| 163 | /// if the image requires any pointers to be initialized to symbols in other |
| 164 | /// images. The bind information is a stream of byte sized opcodes whose |
| 165 | /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is |
| 166 | /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal, |
| 167 | /// symbol-name, addend> The opcodes are a compressed way to encode the table by |
| 168 | /// only encoding when a column changes. In addition simple patterns like for |
| 169 | /// runs of pointers initialized to the same value can be encoded in a few |
| 170 | /// bytes. |
| 171 | struct BindInfo { |
| 172 | // At the moment we do not parse this info (and it is simply copied over), |
| 173 | // but the proper support will be added later. |
| 174 | ArrayRef<uint8_t> Opcodes; |
| 175 | }; |
| 176 | |
| 177 | /// The location of the weak bind info inside the binary is described by |
| 178 | /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols |
| 179 | /// so that all images in the process use the same copy of some code/data. This |
| 180 | /// step is done after binding. The content of the weak_bind info is an opcode |
| 181 | /// stream like the bind_info. But it is sorted alphabetically by symbol name. |
/// This enables dyld to walk all images with weak binding information in order
| 183 | /// and look for collisions. If there are no collisions, dyld does no updating. |
| 184 | /// That means that some fixups are also encoded in the bind_info. For |
| 185 | /// instance, all calls to "operator new" are first bound to libstdc++.dylib |
| 186 | /// using the information in bind_info. Then if some image overrides operator |
| 187 | /// new that is detected when the weak_bind information is processed and the |
| 188 | /// call to operator new is then rebound. |
| 189 | struct WeakBindInfo { |
| 190 | // At the moment we do not parse this info (and it is simply copied over), |
| 191 | // but the proper support will be added later. |
| 192 | ArrayRef<uint8_t> Opcodes; |
| 193 | }; |
| 194 | |
| 195 | /// The location of the lazy bind info inside the binary is described by |
| 196 | /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be |
| 197 | /// bound immediately. Instead they can be lazily bound on first use. The |
| 198 | /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal |
| 199 | /// use is that dyld ignores the lazy_bind section when loading an image. |
| 200 | /// Instead the static linker arranged for the lazy pointer to initially point |
| 201 | /// to a helper function which pushes the offset into the lazy_bind area for the |
| 202 | /// symbol needing to be bound, then jumps to dyld which simply adds the offset |
| 203 | /// to lazy_bind_off to get the information on what to bind. |
| 204 | struct LazyBindInfo { |
| 205 | ArrayRef<uint8_t> Opcodes; |
| 206 | }; |
| 207 | |
| 208 | /// The location of the export info inside the binary is described by |
| 209 | /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a |
| 210 | /// trie. This is a compact representation that factors out common prefixes. It |
| 211 | /// also reduces LINKEDIT pages in RAM because it encodes all information (name, |
| 212 | /// address, flags) in one small, contiguous range. The export area is a stream |
| 213 | /// of nodes. The first node sequentially is the start node for the trie. Nodes |
| 214 | /// for a symbol start with a uleb128 that is the length of the exported symbol |
| 215 | /// information for the string so far. If there is no exported symbol, the node |
| 216 | /// starts with a zero byte. If there is exported info, it follows the length. |
| 217 | /// First is a uleb128 containing flags. Normally, it is followed by |
| 218 | /// a uleb128 encoded offset which is location of the content named |
| 219 | /// by the symbol from the mach_header for the image. If the flags |
| 220 | /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is |
| 221 | /// a uleb128 encoded library ordinal, then a zero terminated |
| 222 | /// UTF8 string. If the string is zero length, then the symbol |
/// is re-exported from the specified dylib with the same name.
| 224 | /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following |
| 225 | /// the flags is two uleb128s: the stub offset and the resolver offset. |
| 226 | /// The stub is used by non-lazy pointers. The resolver is used |
| 227 | /// by lazy pointers and must be called to get the actual address to use. |
| 228 | /// After the optional exported symbol information is a byte of |
| 229 | /// how many edges (0-255) that this node has leaving it, |
| 230 | /// followed by each edge. |
/// Each edge is a zero terminated UTF8 of the additional chars
| 232 | /// in the symbol, followed by a uleb128 offset for the node that |
| 233 | /// edge points to. |
| 234 | struct ExportInfo { |
| 235 | ArrayRef<uint8_t> Trie; |
| 236 | }; |
| 237 | |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame] | 238 | struct LinkData { |
| 239 | ArrayRef<uint8_t> Data; |
| 240 | }; |
| 241 | |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 242 | struct Object { |
| 243 | MachHeader Header; |
| 244 | std::vector<LoadCommand> LoadCommands; |
| 245 | |
| 246 | SymbolTable SymTable; |
| 247 | StringTable StrTable; |
| 248 | |
| 249 | RebaseInfo Rebases; |
| 250 | BindInfo Binds; |
| 251 | WeakBindInfo WeakBinds; |
| 252 | LazyBindInfo LazyBinds; |
| 253 | ExportInfo Exports; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame] | 254 | IndirectSymbolTable IndirectSymTable; |
| 255 | LinkData DataInCode; |
| 256 | LinkData FunctionStarts; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 257 | |
| 258 | /// The index of LC_SYMTAB load command if present. |
| 259 | Optional<size_t> SymTabCommandIndex; |
| 260 | /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. |
| 261 | Optional<size_t> DyLdInfoCommandIndex; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame] | 262 | /// The index LC_DYSYMTAB load comamnd if present. |
| 263 | Optional<size_t> DySymTabCommandIndex; |
| 264 | /// The index LC_DATA_IN_CODE load comamnd if present. |
| 265 | Optional<size_t> DataInCodeCommandIndex; |
| 266 | /// The index LC_FUNCTION_STARTS load comamnd if present. |
| 267 | Optional<size_t> FunctionStartsCommandIndex; |
Seiya Nuta | 7f19dd1 | 2019-10-28 15:40:37 +0900 | [diff] [blame] | 268 | |
| 269 | void removeSections(function_ref<bool(const Section &)> ToRemove); |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 270 | }; |
| 271 | |
| 272 | } // end namespace macho |
| 273 | } // end namespace objcopy |
| 274 | } // end namespace llvm |
| 275 | |
| 276 | #endif // LLVM_OBJCOPY_MACHO_OBJECT_H |