Merge "Create minimal remap table for symbol binary search."
diff --git a/libunwindstack/Symbols.cpp b/libunwindstack/Symbols.cpp
index e3c15a2..2117ebd 100644
--- a/libunwindstack/Symbols.cpp
+++ b/libunwindstack/Symbols.cpp
@@ -19,6 +19,7 @@
 
 #include <algorithm>
 #include <string>
+#include <vector>
 
 #include <unwindstack/Memory.h>
 
@@ -29,23 +30,55 @@
 
 Symbols::Symbols(uint64_t offset, uint64_t size, uint64_t entry_size, uint64_t str_offset,
                  uint64_t str_size)
-    : cur_offset_(offset),
-      offset_(offset),
-      end_(offset + size),
+    : offset_(offset),
+      count_(entry_size != 0 ? size / entry_size : 0),
       entry_size_(entry_size),
       str_offset_(str_offset),
       str_end_(str_offset_ + str_size) {}
 
-const Symbols::Info* Symbols::GetInfoFromCache(uint64_t addr) {
-  // Binary search the table.
+template <typename SymType>
+static bool IsFunc(const SymType* entry) {
+  return entry->st_shndx != SHN_UNDEF && ELF32_ST_TYPE(entry->st_info) == STT_FUNC;
+}
+
+// Read symbol entry from memory and cache it so we don't have to read it again.
+template <typename SymType>
+inline __attribute__((__always_inline__)) const Symbols::Info* Symbols::ReadFuncInfo(
+    uint32_t symbol_index, Memory* elf_memory) {
+  auto it = symbols_.find(symbol_index);
+  if (it != symbols_.end()) {
+    return &it->second;
+  }
+  SymType sym;
+  if (!elf_memory->ReadFully(offset_ + symbol_index * entry_size_, &sym, sizeof(sym))) {
+    return nullptr;
+  }
+  if (!IsFunc(&sym)) {
+    // We need the address for binary search, but we don't want it to be matched.
+    sym.st_size = 0;
+  }
+  Info info{.addr = sym.st_value, .size = static_cast<uint32_t>(sym.st_size), .name = sym.st_name};
+  return &symbols_.emplace(symbol_index, info).first->second;
+}
+
+// Binary search the symbol table to find function containing the given address.
+// Without remap, the symbol table is assumed to be sorted and accessed directly.
+// If the symbol table is not sorted this method might fail but should not crash.
+// When the indices are remapped, they are guaranteed to be sorted by address.
+template <typename SymType, bool RemapIndices>
+const Symbols::Info* Symbols::BinarySearch(uint64_t addr, Memory* elf_memory) {
   size_t first = 0;
-  size_t last = symbols_.size();
+  size_t last = RemapIndices ? remap_->size() : count_;
   while (first < last) {
     size_t current = first + (last - first) / 2;
-    const Info* info = &symbols_[current];
-    if (addr < info->start_offset) {
+    size_t symbol_index = RemapIndices ? remap_.value()[current] : current;
+    const Info* info = ReadFuncInfo<SymType>(symbol_index, elf_memory);
+    if (info == nullptr) {
+      return nullptr;
+    }
+    if (addr < info->addr) {
       last = current;
-    } else if (addr < info->end_offset) {
+    } else if (addr < info->addr + info->size) {
       return info;
     } else {
       first = current + 1;
@@ -54,64 +87,72 @@
   return nullptr;
 }
 
+// Create remapping table which allows us to access symbols as if they were sorted by address.
 template <typename SymType>
-bool Symbols::GetName(uint64_t addr, Memory* elf_memory, std::string* name, uint64_t* func_offset) {
-  if (symbols_.size() != 0) {
-    const Info* info = GetInfoFromCache(addr);
-    if (info) {
-      CHECK(addr >= info->start_offset && addr <= info->end_offset);
-      *func_offset = addr - info->start_offset;
-      return elf_memory->ReadString(info->str_offset, name, str_end_ - info->str_offset);
+void Symbols::BuildRemapTable(Memory* elf_memory) {
+  std::vector<uint64_t> addrs;  // Addresses of all symbols (addrs[i] == symbols[i].st_value).
+  addrs.reserve(count_);
+  remap_.emplace();  // Construct the optional remap table.
+  remap_->reserve(count_);
+  for (size_t symbol_idx = 0; symbol_idx < count_;) {
+    // Read symbols from memory.  We intentionally bypass the cache to save memory.
+    // Do the reads in batches so that we minimize the number of memory read calls.
+    uint8_t buffer[1024];
+    size_t read = std::min<size_t>(sizeof(buffer), (count_ - symbol_idx) * entry_size_);
+    size_t size = elf_memory->Read(offset_ + symbol_idx * entry_size_, buffer, read);
+    if (size < sizeof(SymType)) {
+      break;  // Stop processing, something looks like it is corrupted.
     }
-  }
-
-  bool symbol_added = false;
-  bool return_value = false;
-  while (cur_offset_ + entry_size_ <= end_) {
-    SymType entry;
-    if (!elf_memory->ReadFully(cur_offset_, &entry, sizeof(entry))) {
-      // Stop all processing, something looks like it is corrupted.
-      cur_offset_ = UINT64_MAX;
-      return false;
-    }
-    cur_offset_ += entry_size_;
-
-    if (entry.st_shndx != SHN_UNDEF && ELF32_ST_TYPE(entry.st_info) == STT_FUNC) {
-      // Treat st_value as virtual address.
-      uint64_t start_offset = entry.st_value;
-      uint64_t end_offset = start_offset + entry.st_size;
-
-      // Cache the value.
-      symbols_.emplace_back(start_offset, end_offset, str_offset_ + entry.st_name);
-      symbol_added = true;
-
-      if (addr >= start_offset && addr < end_offset) {
-        *func_offset = addr - start_offset;
-        uint64_t offset = str_offset_ + entry.st_name;
-        if (offset < str_end_) {
-          return_value = elf_memory->ReadString(offset, name, str_end_ - offset);
-        }
-        break;
+    for (size_t offset = 0; offset + sizeof(SymType) <= size; offset += entry_size_, symbol_idx++) {
+      SymType sym;
+      memcpy(&sym, &buffer[offset], sizeof(SymType));  // Copy to ensure alignment.
+      addrs.push_back(sym.st_value);  // Always insert so it is indexable by symbol index.
+      if (IsFunc(&sym)) {
+        remap_->push_back(symbol_idx);  // Indices of function symbols only.
       }
     }
   }
+  // Sort by address to make the remap list binary searchable (stable due to the a<b tie break).
+  auto comp = [&addrs](auto a, auto b) { return std::tie(addrs[a], a) < std::tie(addrs[b], b); };
+  std::sort(remap_->begin(), remap_->end(), comp);
+  // Remove duplicate entries (methods de-duplicated by the linker).
+  auto pred = [&addrs](auto a, auto b) { return addrs[a] == addrs[b]; };
+  remap_->erase(std::unique(remap_->begin(), remap_->end(), pred), remap_->end());
+  remap_->shrink_to_fit();
+}
 
-  if (symbol_added) {
-    std::sort(symbols_.begin(), symbols_.end(),
-              [](const Info& a, const Info& b) { return a.start_offset < b.start_offset; });
+template <typename SymType>
+bool Symbols::GetName(uint64_t addr, Memory* elf_memory, std::string* name, uint64_t* func_offset) {
+  const Info* info;
+  if (!remap_.has_value()) {
+    // Assume the symbol table is sorted. If it is not, this will gracefully fail.
+    info = BinarySearch<SymType, false>(addr, elf_memory);
+    if (info == nullptr) {
+      // Create the remapping table and retry the search.
+      BuildRemapTable<SymType>(elf_memory);
+      symbols_.clear();  // Remove cached symbols since the access pattern will be different.
+      info = BinarySearch<SymType, true>(addr, elf_memory);
+    }
+  } else {
+    // Fast search using the previously created remap table.
+    info = BinarySearch<SymType, true>(addr, elf_memory);
   }
-  return return_value;
+  if (info == nullptr) {
+    return false;
+  }
+  // Read the function name from the string table.
+  *func_offset = addr - info->addr;
+  uint64_t str = str_offset_ + info->name;
+  return str < str_end_ && elf_memory->ReadString(str, name, str_end_ - str);
 }
 
 template <typename SymType>
 bool Symbols::GetGlobal(Memory* elf_memory, const std::string& name, uint64_t* memory_address) {
-  uint64_t cur_offset = offset_;
-  while (cur_offset + entry_size_ <= end_) {
+  for (uint32_t i = 0; i < count_; i++) {
     SymType entry;
-    if (!elf_memory->ReadFully(cur_offset, &entry, sizeof(entry))) {
+    if (!elf_memory->ReadFully(offset_ + i * entry_size_, &entry, sizeof(entry))) {
       return false;
     }
-    cur_offset += entry_size_;
 
     if (entry.st_shndx != SHN_UNDEF && ELF32_ST_TYPE(entry.st_info) == STT_OBJECT &&
         ELF32_ST_BIND(entry.st_info) == STB_GLOBAL) {
diff --git a/libunwindstack/Symbols.h b/libunwindstack/Symbols.h
index 7fcd067..3b3f20b 100644
--- a/libunwindstack/Symbols.h
+++ b/libunwindstack/Symbols.h
@@ -19,8 +19,9 @@
 
 #include <stdint.h>
 
+#include <optional>
 #include <string>
-#include <vector>
+#include <unordered_map>
 
 namespace unwindstack {
 
@@ -29,11 +30,9 @@
 
 class Symbols {
   struct Info {
-    Info(uint64_t start_offset, uint64_t end_offset, uint64_t str_offset)
-        : start_offset(start_offset), end_offset(end_offset), str_offset(str_offset) {}
-    uint64_t start_offset;
-    uint64_t end_offset;
-    uint64_t str_offset;
+    uint64_t addr;  // Symbol address.
+    uint32_t size;  // Symbol size in bytes. Zero if not a function.
+    uint32_t name;  // Offset in .strtab.
   };
 
  public:
@@ -41,8 +40,6 @@
           uint64_t str_size);
   virtual ~Symbols() = default;
 
-  const Info* GetInfoFromCache(uint64_t addr);
-
   template <typename SymType>
   bool GetName(uint64_t addr, Memory* elf_memory, std::string* name, uint64_t* func_offset);
 
@@ -51,18 +48,27 @@
 
   void ClearCache() {
     symbols_.clear();
-    cur_offset_ = offset_;
+    remap_.reset();
   }
 
  private:
-  uint64_t cur_offset_;
-  uint64_t offset_;
-  uint64_t end_;
-  uint64_t entry_size_;
-  uint64_t str_offset_;
-  uint64_t str_end_;
+  template <typename SymType>
+  const Info* ReadFuncInfo(uint32_t symbol_index, Memory* elf_memory);
 
-  std::vector<Info> symbols_;
+  template <typename SymType, bool RemapIndices>
+  const Info* BinarySearch(uint64_t addr, Memory* elf_memory);
+
+  template <typename SymType>
+  void BuildRemapTable(Memory* elf_memory);
+
+  const uint64_t offset_;
+  const uint64_t count_;
+  const uint64_t entry_size_;
+  const uint64_t str_offset_;
+  const uint64_t str_end_;
+
+  std::unordered_map<uint32_t, Info> symbols_;  // Cache of read symbols (keyed by symbol index).
+  std::optional<std::vector<uint32_t>> remap_;  // Indices of function symbols sorted by address.
 };
 
 }  // namespace unwindstack
diff --git a/libunwindstack/tests/SymbolsTest.cpp b/libunwindstack/tests/SymbolsTest.cpp
index c58aeff..9afbeec 100644
--- a/libunwindstack/tests/SymbolsTest.cpp
+++ b/libunwindstack/tests/SymbolsTest.cpp
@@ -185,18 +185,21 @@
   std::string fake_name;
 
   this->InitSym(&sym, 0x5000, 0x10, 0x40);
+  this->memory_.SetMemoryBlock(offset, entry_size, 0);
   this->memory_.SetMemory(offset, &sym, sizeof(sym));
   fake_name = "function_one";
   this->memory_.SetMemory(0x2040, fake_name.c_str(), fake_name.size() + 1);
   offset += entry_size;
 
   this->InitSym(&sym, 0x3004, 0x200, 0x100);
+  this->memory_.SetMemoryBlock(offset, entry_size, 0);
   this->memory_.SetMemory(offset, &sym, sizeof(sym));
   fake_name = "function_two";
   this->memory_.SetMemory(0x2100, fake_name.c_str(), fake_name.size() + 1);
   offset += entry_size;
 
   this->InitSym(&sym, 0xa010, 0x20, 0x230);
+  this->memory_.SetMemoryBlock(offset, entry_size, 0);
   this->memory_.SetMemory(offset, &sym, sizeof(sym));
   fake_name = "function_three";
   this->memory_.SetMemory(0x2230, fake_name.c_str(), fake_name.size() + 1);
@@ -274,7 +277,9 @@
   // Do call that should cache all of the entries (except the string data).
   std::string name;
   uint64_t func_offset;
-  ASSERT_FALSE(symbols.GetName<TypeParam>(0x6000, &this->memory_, &name, &func_offset));
+  ASSERT_FALSE(symbols.GetName<TypeParam>(0x5000, &this->memory_, &name, &func_offset));
+  ASSERT_FALSE(symbols.GetName<TypeParam>(0x2000, &this->memory_, &name, &func_offset));
+  ASSERT_FALSE(symbols.GetName<TypeParam>(0x1000, &this->memory_, &name, &func_offset));
   this->memory_.Clear();
   ASSERT_FALSE(symbols.GetName<TypeParam>(0x6000, &this->memory_, &name, &func_offset));