Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 1 | //===- Object.h - Mach-O object file model ----------------------*- C++ -*-===// |
| 2 | // |
Chandler Carruth | 127252b | 2019-02-11 08:25:19 +0000 | [diff] [blame] | 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #ifndef LLVM_OBJCOPY_MACHO_OBJECT_H |
| 10 | #define LLVM_OBJCOPY_MACHO_OBJECT_H |
| 11 | |
| 12 | #include "llvm/ADT/Optional.h" |
| 13 | #include "llvm/ADT/StringRef.h" |
| 14 | #include "llvm/BinaryFormat/MachO.h" |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 15 | #include "llvm/MC/StringTableBuilder.h" |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 16 | #include "llvm/ObjectYAML/DWARFYAML.h" |
| 17 | #include "llvm/Support/YAMLTraits.h" |
| 18 | #include <cstdint> |
| 19 | #include <string> |
| 20 | #include <vector> |
| 21 | |
| 22 | namespace llvm { |
| 23 | namespace objcopy { |
| 24 | namespace macho { |
| 25 | |
/// Header fields of a Mach-O file. The member names mirror the fields of
/// mach_header / mach_header_64 (see llvm/BinaryFormat/MachO.h).
///
/// Every member is zero-initialized so that a default-constructed header never
/// exposes indeterminate values.
struct MachHeader {
  uint32_t Magic = 0;
  uint32_t CPUType = 0;
  uint32_t CPUSubType = 0;
  uint32_t FileType = 0;
  uint32_t NCmds = 0;
  uint32_t SizeOfCmds = 0;
  uint32_t Flags = 0;
  // NOTE(review): presumably only meaningful for the 64-bit header layout;
  // confirm against the reader/writer.
  uint32_t Reserved = 0;
};
| 36 | |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 37 | struct RelocationInfo; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 38 | struct Section { |
Seiya Nuta | b728e53 | 2019-06-08 01:22:54 +0000 | [diff] [blame] | 39 | std::string Sectname; |
| 40 | std::string Segname; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 41 | uint64_t Addr; |
| 42 | uint64_t Size; |
| 43 | uint32_t Offset; |
| 44 | uint32_t Align; |
| 45 | uint32_t RelOff; |
| 46 | uint32_t NReloc; |
| 47 | uint32_t Flags; |
| 48 | uint32_t Reserved1; |
| 49 | uint32_t Reserved2; |
| 50 | uint32_t Reserved3; |
| 51 | |
| 52 | StringRef Content; |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 53 | std::vector<RelocationInfo> Relocations; |
Seiya Nuta | b728e53 | 2019-06-08 01:22:54 +0000 | [diff] [blame] | 54 | |
| 55 | MachO::SectionType getType() const { |
| 56 | return static_cast<MachO::SectionType>(Flags & MachO::SECTION_TYPE); |
| 57 | } |
| 58 | |
| 59 | bool isVirtualSection() const { |
| 60 | return (getType() == MachO::S_ZEROFILL || |
| 61 | getType() == MachO::S_GB_ZEROFILL || |
| 62 | getType() == MachO::S_THREAD_LOCAL_ZEROFILL); |
| 63 | } |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 64 | }; |
| 65 | |
| 66 | struct LoadCommand { |
| 67 | // The type MachO::macho_load_command is defined in llvm/BinaryFormat/MachO.h |
| 68 | // and it is a union of all the structs corresponding to various load |
| 69 | // commands. |
| 70 | MachO::macho_load_command MachOLoadCommand; |
| 71 | |
| 72 | // The raw content of the payload of the load command (located right after the |
| 73 | // corresponding struct). In some cases it is either empty or can be |
| 74 | // copied-over without digging into its structure. |
| 75 | ArrayRef<uint8_t> Payload; |
| 76 | |
| 77 | // Some load commands can contain (inside the payload) an array of sections, |
| 78 | // though the contents of the sections are stored separately. The struct |
| 79 | // Section describes only sections' metadata and where to find the |
| 80 | // corresponding content inside the binary. |
| 81 | std::vector<Section> Sections; |
| 82 | }; |
| 83 | |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 84 | // A symbol information. Fields which starts with "n_" are same as them in the |
| 85 | // nlist. |
| 86 | struct SymbolEntry { |
| 87 | std::string Name; |
| 88 | uint32_t Index; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 89 | uint8_t n_type; |
| 90 | uint8_t n_sect; |
| 91 | uint16_t n_desc; |
| 92 | uint64_t n_value; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame^] | 93 | |
| 94 | bool isExternalSymbol() const { |
| 95 | return n_type & ((MachO::N_EXT | MachO::N_PEXT)); |
| 96 | } |
| 97 | |
| 98 | bool isLocalSymbol() const { return !isExternalSymbol(); } |
| 99 | |
| 100 | bool isUndefinedSymbol() const { |
| 101 | return (n_type & MachO::N_TYPE) == MachO::N_UNDF; |
| 102 | } |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 103 | }; |
| 104 | |
| 105 | /// The location of the symbol table inside the binary is described by LC_SYMTAB |
| 106 | /// load command. |
| 107 | struct SymbolTable { |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 108 | std::vector<std::unique_ptr<SymbolEntry>> Symbols; |
| 109 | |
| 110 | const SymbolEntry *getSymbolByIndex(uint32_t Index) const; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 111 | }; |
| 112 | |
/// The indirect symbol table, kept as a plain list of 32-bit entries.
// NOTE(review): the entries are presumably indices into the symbol table;
// confirm against the reader/writer.
struct IndirectSymbolTable {
  std::vector<uint32_t> Symbols;
};
| 116 | |
/// The string table. Where it lives inside the binary is described by the
/// LC_SYMTAB load command.
struct StringTable {
  // The individual strings, each stored as an owned std::string.
  std::vector<std::string> Strings;
};
| 122 | |
Seiya Nuta | f923d9b | 2019-06-21 00:21:50 +0000 | [diff] [blame] | 123 | struct RelocationInfo { |
| 124 | const SymbolEntry *Symbol; |
| 125 | // True if Info is a scattered_relocation_info. |
| 126 | bool Scattered; |
| 127 | MachO::any_relocation_info Info; |
| 128 | }; |
| 129 | |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 130 | /// The location of the rebase info inside the binary is described by |
| 131 | /// LC_DYLD_INFO load command. Dyld rebases an image whenever dyld loads it at |
| 132 | /// an address different from its preferred address. The rebase information is |
| 133 | /// a stream of byte sized opcodes whose symbolic names start with |
| 134 | /// REBASE_OPCODE_. Conceptually the rebase information is a table of tuples: |
| 135 | /// <seg-index, seg-offset, type> |
| 136 | /// The opcodes are a compressed way to encode the table by only |
| 137 | /// encoding when a column changes. In addition simple patterns |
| 138 | /// like "every n'th offset for m times" can be encoded in a few |
| 139 | /// bytes. |
| 140 | struct RebaseInfo { |
| 141 | // At the moment we do not parse this info (and it is simply copied over), |
| 142 | // but the proper support will be added later. |
| 143 | ArrayRef<uint8_t> Opcodes; |
| 144 | }; |
| 145 | |
| 146 | /// The location of the bind info inside the binary is described by |
| 147 | /// LC_DYLD_INFO load command. Dyld binds an image during the loading process, |
| 148 | /// if the image requires any pointers to be initialized to symbols in other |
| 149 | /// images. The bind information is a stream of byte sized opcodes whose |
| 150 | /// symbolic names start with BIND_OPCODE_. Conceptually the bind information is |
| 151 | /// a table of tuples: <seg-index, seg-offset, type, symbol-library-ordinal, |
| 152 | /// symbol-name, addend> The opcodes are a compressed way to encode the table by |
| 153 | /// only encoding when a column changes. In addition simple patterns like for |
| 154 | /// runs of pointers initialized to the same value can be encoded in a few |
| 155 | /// bytes. |
| 156 | struct BindInfo { |
| 157 | // At the moment we do not parse this info (and it is simply copied over), |
| 158 | // but the proper support will be added later. |
| 159 | ArrayRef<uint8_t> Opcodes; |
| 160 | }; |
| 161 | |
| 162 | /// The location of the weak bind info inside the binary is described by |
| 163 | /// LC_DYLD_INFO load command. Some C++ programs require dyld to unique symbols |
| 164 | /// so that all images in the process use the same copy of some code/data. This |
| 165 | /// step is done after binding. The content of the weak_bind info is an opcode |
| 166 | /// stream like the bind_info. But it is sorted alphabetically by symbol name. |
| 167 | /// This enable dyld to walk all images with weak binding information in order |
| 168 | /// and look for collisions. If there are no collisions, dyld does no updating. |
| 169 | /// That means that some fixups are also encoded in the bind_info. For |
| 170 | /// instance, all calls to "operator new" are first bound to libstdc++.dylib |
| 171 | /// using the information in bind_info. Then if some image overrides operator |
| 172 | /// new that is detected when the weak_bind information is processed and the |
| 173 | /// call to operator new is then rebound. |
| 174 | struct WeakBindInfo { |
| 175 | // At the moment we do not parse this info (and it is simply copied over), |
| 176 | // but the proper support will be added later. |
| 177 | ArrayRef<uint8_t> Opcodes; |
| 178 | }; |
| 179 | |
| 180 | /// The location of the lazy bind info inside the binary is described by |
| 181 | /// LC_DYLD_INFO load command. Some uses of external symbols do not need to be |
| 182 | /// bound immediately. Instead they can be lazily bound on first use. The |
| 183 | /// lazy_bind contains a stream of BIND opcodes to bind all lazy symbols. Normal |
| 184 | /// use is that dyld ignores the lazy_bind section when loading an image. |
| 185 | /// Instead the static linker arranged for the lazy pointer to initially point |
| 186 | /// to a helper function which pushes the offset into the lazy_bind area for the |
| 187 | /// symbol needing to be bound, then jumps to dyld which simply adds the offset |
| 188 | /// to lazy_bind_off to get the information on what to bind. |
| 189 | struct LazyBindInfo { |
| 190 | ArrayRef<uint8_t> Opcodes; |
| 191 | }; |
| 192 | |
| 193 | /// The location of the export info inside the binary is described by |
| 194 | /// LC_DYLD_INFO load command. The symbols exported by a dylib are encoded in a |
| 195 | /// trie. This is a compact representation that factors out common prefixes. It |
| 196 | /// also reduces LINKEDIT pages in RAM because it encodes all information (name, |
| 197 | /// address, flags) in one small, contiguous range. The export area is a stream |
| 198 | /// of nodes. The first node sequentially is the start node for the trie. Nodes |
| 199 | /// for a symbol start with a uleb128 that is the length of the exported symbol |
| 200 | /// information for the string so far. If there is no exported symbol, the node |
| 201 | /// starts with a zero byte. If there is exported info, it follows the length. |
| 202 | /// First is a uleb128 containing flags. Normally, it is followed by |
| 203 | /// a uleb128 encoded offset which is location of the content named |
| 204 | /// by the symbol from the mach_header for the image. If the flags |
| 205 | /// is EXPORT_SYMBOL_FLAGS_REEXPORT, then following the flags is |
| 206 | /// a uleb128 encoded library ordinal, then a zero terminated |
| 207 | /// UTF8 string. If the string is zero length, then the symbol |
| 208 | /// is re-export from the specified dylib with the same name. |
| 209 | /// If the flags is EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER, then following |
| 210 | /// the flags is two uleb128s: the stub offset and the resolver offset. |
| 211 | /// The stub is used by non-lazy pointers. The resolver is used |
| 212 | /// by lazy pointers and must be called to get the actual address to use. |
| 213 | /// After the optional exported symbol information is a byte of |
| 214 | /// how many edges (0-255) that this node has leaving it, |
| 215 | /// followed by each edge. |
| 216 | /// Each edge is a zero terminated UTF8 of the addition chars |
| 217 | /// in the symbol, followed by a uleb128 offset for the node that |
| 218 | /// edge points to. |
| 219 | struct ExportInfo { |
| 220 | ArrayRef<uint8_t> Trie; |
| 221 | }; |
| 222 | |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame^] | 223 | struct LinkData { |
| 224 | ArrayRef<uint8_t> Data; |
| 225 | }; |
| 226 | |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 227 | struct Object { |
| 228 | MachHeader Header; |
| 229 | std::vector<LoadCommand> LoadCommands; |
| 230 | |
| 231 | SymbolTable SymTable; |
| 232 | StringTable StrTable; |
| 233 | |
| 234 | RebaseInfo Rebases; |
| 235 | BindInfo Binds; |
| 236 | WeakBindInfo WeakBinds; |
| 237 | LazyBindInfo LazyBinds; |
| 238 | ExportInfo Exports; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame^] | 239 | IndirectSymbolTable IndirectSymTable; |
| 240 | LinkData DataInCode; |
| 241 | LinkData FunctionStarts; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 242 | |
| 243 | /// The index of LC_SYMTAB load command if present. |
| 244 | Optional<size_t> SymTabCommandIndex; |
| 245 | /// The index of LC_DYLD_INFO or LC_DYLD_INFO_ONLY load command if present. |
| 246 | Optional<size_t> DyLdInfoCommandIndex; |
Seiya Nuta | 552bcb8 | 2019-08-19 21:05:31 +0000 | [diff] [blame^] | 247 | /// The index LC_DYSYMTAB load comamnd if present. |
| 248 | Optional<size_t> DySymTabCommandIndex; |
| 249 | /// The index LC_DATA_IN_CODE load comamnd if present. |
| 250 | Optional<size_t> DataInCodeCommandIndex; |
| 251 | /// The index LC_FUNCTION_STARTS load comamnd if present. |
| 252 | Optional<size_t> FunctionStartsCommandIndex; |
Alexander Shaposhnikov | d911ed1 | 2019-02-02 00:38:07 +0000 | [diff] [blame] | 253 | }; |
| 254 | |
| 255 | } // end namespace macho |
| 256 | } // end namespace objcopy |
| 257 | } // end namespace llvm |
| 258 | |
| 259 | #endif // LLVM_OBJCOPY_MACHO_OBJECT_H |