Breakpad: Extract parsing code into a separate file Summary: This centralizes parsing of breakpad records, which was previously spread out over ObjectFileBreakpad and SymbolFileBreakpad. For each record type X there is a separate breakpad::XRecord class, and an associated parse function. The classes just store the information in the breakpad records in a more accessible form. It is up to the users to determine what to do with that data. This separation also made it possible to write some targeted tests for the parsing code, which was previously inaccessible, so I wrote a couple of those too. Reviewers: clayborg, lemo, zturner Reviewed By: clayborg Subscribers: mgorny, fedor.sergeev, lldb-commits Differential Revision: https://reviews.llvm.org/D56844 llvm-svn: 351541

commit: 2cf5486ce4539d868ec84f1047aed8cd89739661 [log] [tgz]
author: Pavel Labath <pavel@labath.sk> Fri Jan 18 10:37:04 2019 +0000
committer: Pavel Labath <pavel@labath.sk> Fri Jan 18 10:37:04 2019 +0000
tree: 02837a3b37ed8b52172c3c3e23daff8a50930b16
parent: 9c66a47831a586ed36a138453a3234797f3a0127 [diff] [blame]
diff --git a/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp b/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp
index 9170250..3d360b0 100644
--- a/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp
+++ b/lldb/source/Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.cpp

@@ -8,11 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "Plugins/ObjectFile/Breakpad/ObjectFileBreakpad.h"
+#include "Plugins/ObjectFile/Breakpad/BreakpadRecords.h"
 #include "lldb/Core/ModuleSpec.h"
 #include "lldb/Core/PluginManager.h"
 #include "lldb/Core/Section.h"
-#include "lldb/Utility/DataBuffer.h"
-#include "llvm/ADT/StringExtras.h"
 
 using namespace lldb;
 using namespace lldb_private;
@@ -24,164 +23,24 @@
   UUID uuid;
   static llvm::Optional<Header> parse(llvm::StringRef text);
 };
-
-enum class Token { Unknown, Module, Info, File, Func, Public, Stack };
 } // namespace
 
-static Token toToken(llvm::StringRef str) {
-  return llvm::StringSwitch<Token>(str)
-      .Case("MODULE", Token::Module)
-      .Case("INFO", Token::Info)
-      .Case("FILE", Token::File)
-      .Case("FUNC", Token::Func)
-      .Case("PUBLIC", Token::Public)
-      .Case("STACK", Token::Stack)
-      .Default(Token::Unknown);
-}
-
-static llvm::StringRef toString(Token t) {
-  switch (t) {
-  case Token::Unknown:
-    return "";
-  case Token::Module:
-    return "MODULE";
-  case Token::Info:
-    return "INFO";
-  case Token::File:
-    return "FILE";
-  case Token::Func:
-    return "FUNC";
-  case Token::Public:
-    return "PUBLIC";
-  case Token::Stack:
-    return "STACK";
-  }
-  llvm_unreachable("Unknown token!");
-}
-
-static llvm::Triple::OSType toOS(llvm::StringRef str) {
-  using llvm::Triple;
-  return llvm::StringSwitch<Triple::OSType>(str)
-      .Case("Linux", Triple::Linux)
-      .Case("mac", Triple::MacOSX)
-      .Case("windows", Triple::Win32)
-      .Default(Triple::UnknownOS);
-}
-
-static llvm::Triple::ArchType toArch(llvm::StringRef str) {
-  using llvm::Triple;
-  return llvm::StringSwitch<Triple::ArchType>(str)
-      .Case("arm", Triple::arm)
-      .Case("arm64", Triple::aarch64)
-      .Case("mips", Triple::mips)
-      .Case("ppc", Triple::ppc)
-      .Case("ppc64", Triple::ppc64)
-      .Case("s390", Triple::systemz)
-      .Case("sparc", Triple::sparc)
-      .Case("sparcv9", Triple::sparcv9)
-      .Case("x86", Triple::x86)
-      .Case("x86_64", Triple::x86_64)
-      .Default(Triple::UnknownArch);
-}
-
-static llvm::StringRef consume_front(llvm::StringRef &str, size_t n) {
-  llvm::StringRef result = str.take_front(n);
-  str = str.drop_front(n);
-  return result;
-}
-
-static UUID parseModuleId(llvm::Triple::OSType os, llvm::StringRef str) {
-  struct uuid_data {
-    llvm::support::ulittle32_t uuid1;
-    llvm::support::ulittle16_t uuid2[2];
-    uint8_t uuid3[8];
-    llvm::support::ulittle32_t age;
-  } data;
-  static_assert(sizeof(data) == 20, "");
-  // The textual module id encoding should be between 33 and 40 bytes long,
-  // depending on the size of the age field, which is of variable length.
-  // The first three chunks of the id are encoded in big endian, so we need to
-  // byte-swap those.
-  if (str.size() < 33 || str.size() > 40)
-    return UUID();
-  uint32_t t;
-  if (to_integer(consume_front(str, 8), t, 16))
-    data.uuid1 = t;
-  else
-    return UUID();
-  for (int i = 0; i < 2; ++i) {
-    if (to_integer(consume_front(str, 4), t, 16))
-      data.uuid2[i] = t;
-    else
-      return UUID();
-  }
-  for (int i = 0; i < 8; ++i) {
-    if (!to_integer(consume_front(str, 2), data.uuid3[i], 16))
-      return UUID();
-  }
-  if (to_integer(str, t, 16))
-    data.age = t;
-  else
-    return UUID();
-
-  // On non-windows, the age field should always be zero, so we don't include to
-  // match the native uuid format of these platforms.
-  return UUID::fromData(&data, os == llvm::Triple::Win32 ? 20 : 16);
-}
-
 llvm::Optional<Header> Header::parse(llvm::StringRef text) {
-  // A valid module should start with something like:
-  // MODULE Linux x86_64 E5894855C35DCCCCCCCCCCCCCCCCCCCC0 a.out
-  // optionally followed by
-  // INFO CODE_ID 554889E55DC3CCCCCCCCCCCCCCCCCCCC [a.exe]
-  llvm::StringRef token, line;
+  llvm::StringRef line;
   std::tie(line, text) = text.split('\n');
-  std::tie(token, line) = getToken(line);
-  if (toToken(token) != Token::Module)
+  auto Module = ModuleRecord::parse(line);
+  if (!Module)
     return llvm::None;
 
-  std::tie(token, line) = getToken(line);
   llvm::Triple triple;
-  triple.setOS(toOS(token));
-  if (triple.getOS() == llvm::Triple::UnknownOS)
-    return llvm::None;
-
-  std::tie(token, line) = getToken(line);
-  triple.setArch(toArch(token));
-  if (triple.getArch() == llvm::Triple::UnknownArch)
-    return llvm::None;
-
-  llvm::StringRef module_id;
-  std::tie(module_id, line) = getToken(line);
+  triple.setArch(Module->getArch());
+  triple.setOS(Module->getOS());
 
   std::tie(line, text) = text.split('\n');
-  std::tie(token, line) = getToken(line);
-  if (token == "INFO") {
-    std::tie(token, line) = getToken(line);
-    if (token != "CODE_ID")
-      return llvm::None;
 
-    std::tie(token, line) = getToken(line);
-    // If we don't have any text following the code id (e.g. on linux), we
-    // should use the module id as UUID. Otherwise, we revert back to the module
-    // id.
-    if (line.trim().empty()) {
-      UUID uuid;
-      if (uuid.SetFromStringRef(token, token.size() / 2) != token.size())
-        return llvm::None;
-
-      return Header{ArchSpec(triple), uuid};
-    }
-  }
-
-  // We reach here if we don't have a INFO CODE_ID section, or we chose not to
-  // use it. In either case, we need to properly decode the module id, whose
-  // fields are encoded in big-endian.
-  UUID uuid = parseModuleId(triple.getOS(), module_id);
-  if (!uuid)
-    return llvm::None;
-
-  return Header{ArchSpec(triple), uuid};
+  auto Info = InfoRecord::parse(line);
+  UUID uuid = Info && Info->getID() ? Info->getID() : Module->getID();
+  return Header{ArchSpec(triple), std::move(uuid)};
 }
 
 void ObjectFileBreakpad::Initialize() {
@@ -274,18 +133,18 @@
     return;
   m_sections_ap = llvm::make_unique<SectionList>();
 
-  Token current_section = Token::Unknown;
+  llvm::Optional<Record::Kind> current_section;
   offset_t section_start;
   llvm::StringRef text = toStringRef(m_data.GetData());
   uint32_t next_section_id = 1;
   auto maybe_add_section = [&](const uint8_t *end_ptr) {
-    if (current_section == Token::Unknown)
+    if (!current_section)
       return; // We have been called before parsing the first line.
 
     offset_t end_offset = end_ptr - m_data.GetDataStart();
     auto section_sp = std::make_shared<Section>(
         GetModule(), this, next_section_id++,
-        ConstString(toString(current_section)), eSectionTypeOther,
+        ConstString(toString(*current_section)), eSectionTypeOther,
         /*file_vm_addr*/ 0, /*vm_size*/ 0, section_start,
         end_offset - section_start, /*log2align*/ 0, /*flags*/ 0);
     m_sections_ap->AddSection(section_sp);
@@ -295,19 +154,19 @@
     llvm::StringRef line;
     std::tie(line, text) = text.split('\n');
 
-    Token token = toToken(getToken(line).first);
-    if (token == Token::Unknown) {
-      // We assume this is a line record, which logically belongs to the Func
-      // section. Errors will be handled when parsing the Func section.
-      token = Token::Func;
+    Record::Kind next_section = Record::classify(line);
+    if (next_section == Record::Line) {
+      // Line records logically belong to the preceding Func record, so we put
+      // them in the same section.
+      next_section = Record::Func;
     }
-    if (token == current_section)
+    if (next_section == current_section)
       continue;
 
     // Changing sections, finish off the previous one, if there was any.
     maybe_add_section(line.bytes_begin());
     // And start a new one.
-    current_section = token;
+    current_section = next_section;
     section_start = line.bytes_begin() - m_data.GetDataStart();
   }
   // Finally, add the last section.
commit	2cf5486ce4539d868ec84f1047aed8cd89739661	[log] [tgz]
author	Pavel Labath <pavel@labath.sk>	Fri Jan 18 10:37:04 2019 +0000
committer	Pavel Labath <pavel@labath.sk>	Fri Jan 18 10:37:04 2019 +0000
tree	02837a3b37ed8b52172c3c3e23daff8a50930b16
parent	9c66a47831a586ed36a138453a3234797f3a0127 [diff] [blame]