[PDB] Don't build the entire source file list up front.

I tried to run llvm-pdbdump on a very large (~1.5GB) PDB to
try and identify show-stopping performance problems.  This
patch addresses the first such problem.

When loading the DBI stream, before anyone has even tried to
access a single record, we build an in memory map of every
source file for every module.  In the particular PDB I was
using, this was over 85 million files.  Specifically, the
complexity is O(m*n) where m is the number of modules and
n is the average number of source files (including headers)
per module.

The whole reason for doing this was so that we could have
constant time access to any module and any of its source
file lists.  However, we can still get O(1) access to the
source file list for a given module with a simple O(m)
precomputation, and access to the list of modules is
already O(1) anyway.

So this patches reduces the O(m*n) up-front precomputation
to an O(m) one, where n is ~6,500 and n*m is about 85 million
in my pathological test case.

Differential Revision: https://reviews.llvm.org/D32870

llvm-svn: 302205
diff --git a/llvm/lib/DebugInfo/PDB/CMakeLists.txt b/llvm/lib/DebugInfo/PDB/CMakeLists.txt
index e175301..e9fd29c 100644
--- a/llvm/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/llvm/lib/DebugInfo/PDB/CMakeLists.txt
@@ -30,6 +30,7 @@
 add_pdb_impl_folder(Native
   Native/DbiModuleDescriptor.cpp
   Native/DbiModuleDescriptorBuilder.cpp
+  Native/DbiModuleList.cpp
   Native/DbiStream.cpp
   Native/DbiStreamBuilder.cpp
   Native/EnumTables.cpp
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp
new file mode 100644
index 0000000..434f775
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiModuleList.cpp
@@ -0,0 +1,273 @@
+//===- DbiModuleList.cpp - PDB module information list ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
+
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/Error.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+DbiModuleSourceFilesIterator::DbiModuleSourceFilesIterator(
+    const DbiModuleList &Modules, uint32_t Modi, uint16_t Filei)
+    : Modules(&Modules), Modi(Modi), Filei(Filei) {
+  setValue();
+}
+
+bool DbiModuleSourceFilesIterator::
+operator==(const DbiModuleSourceFilesIterator &R) const {
+  // incompatible iterators are never equal
+  if (!isCompatible(R))
+    return false;
+
+  // If they're compatible, and they're both ends, then they're equal.
+  if (isEnd() && R.isEnd())
+    return true;
+
+  // If one is an end and the other is not, they're not equal.
+  if (isEnd() != R.isEnd())
+    return false;
+
+  // Now we know:
+  // - They're compatible
+  // - They're not *both* end iterators
+  // - Their endness is the same.
+  // Thus, they're compatible iterators pointing to a valid file on the same
+  // module.  All we need to check are the file indices.
+  assert(Modules == R.Modules);
+  assert(Modi == R.Modi);
+  assert(!isEnd());
+  assert(!R.isEnd());
+
+  return (Filei == R.Filei);
+}
+
+bool DbiModuleSourceFilesIterator::
+operator<(const DbiModuleSourceFilesIterator &R) const {
+  assert(isCompatible(R));
+
+  // It's not sufficient to compare the file indices, because default
+  // constructed iterators could be equal to iterators with valid indices.  To
+  // account for this, early-out if they're equal.
+  if (*this == R)
+    return false;
+
+  return Filei < R.Filei;
+}
+
+std::ptrdiff_t DbiModuleSourceFilesIterator::
+operator-(const DbiModuleSourceFilesIterator &R) const {
+  assert(isCompatible(R));
+  assert(!(*this < R));
+
+  // If they're both end iterators, the distance is 0.
+  if (isEnd() && R.isEnd())
+    return 0;
+
+  assert(!R.isEnd());
+
+  // At this point, R cannot be end, but *this can, which means that *this
+  // might be a universal end iterator with none of its fields set.  So in that
+  // case have to rely on R as the authority to figure out how many files there
+  // are to compute the distance.
+  uint32_t Thisi = Filei;
+  if (isEnd()) {
+    uint32_t RealModi = R.Modi;
+    Thisi = R.Modules->getSourceFileCount(RealModi);
+  }
+
+  assert(Thisi >= R.Filei);
+  return Thisi - R.Filei;
+}
+
+DbiModuleSourceFilesIterator &DbiModuleSourceFilesIterator::
+operator+=(std::ptrdiff_t N) {
+  assert(!isEnd());
+
+  Filei += N;
+  assert(Filei <= Modules->getSourceFileCount(Modi));
+  setValue();
+  return *this;
+}
+
+DbiModuleSourceFilesIterator &DbiModuleSourceFilesIterator::
+operator-=(std::ptrdiff_t N) {
+  // Note that we can subtract from an end iterator, but not a universal end
+  // iterator.
+  assert(!isUniversalEnd());
+
+  assert(N <= Filei);
+
+  Filei -= N;
+  return *this;
+}
+
+void DbiModuleSourceFilesIterator::setValue() {
+  if (isEnd()) {
+    ThisValue = "";
+    return;
+  }
+
+  uint32_t Off = Modules->ModuleInitialFileIndex[Modi] + Filei;
+  auto ExpectedValue = Modules->getFileName(Off);
+  if (!ExpectedValue) {
+    consumeError(ExpectedValue.takeError());
+    Filei = Modules->getSourceFileCount(Modi);
+  } else
+    ThisValue = *ExpectedValue;
+}
+
+bool DbiModuleSourceFilesIterator::isEnd() const {
+  if (isUniversalEnd())
+    return true;
+
+  assert(Modules);
+  assert(Modi <= Modules->getModuleCount());
+  assert(Filei <= Modules->getSourceFileCount(Modi));
+
+  if (Modi == Modules->getModuleCount())
+    return true;
+  if (Filei == Modules->getSourceFileCount(Modi))
+    return true;
+  return false;
+}
+
+bool DbiModuleSourceFilesIterator::isUniversalEnd() const { return !Modules; }
+
+bool DbiModuleSourceFilesIterator::isCompatible(
+    const DbiModuleSourceFilesIterator &R) const {
+  // Universal iterators are compatible with any other iterator.
+  if (isUniversalEnd() || R.isUniversalEnd())
+    return true;
+
+  // At this point, neither iterator is a universal end iterator, although one
+  // or both might be non-universal end iterators.  Regardless, the module index
+  // is valid, so they are compatible if and only if they refer to the same
+  // module.
+  return Modi == R.Modi;
+}
+
+Error DbiModuleList::initialize(BinaryStreamRef ModInfo,
+                                BinaryStreamRef FileInfo) {
+  if (auto EC = initializeModInfo(ModInfo))
+    return EC;
+  if (auto EC = initializeFileInfo(FileInfo))
+    return EC;
+
+  return Error::success();
+}
+
+Error DbiModuleList::initializeModInfo(BinaryStreamRef ModInfo) {
+  ModInfoSubstream = ModInfo;
+
+  if (ModInfo.getLength() == 0)
+    return Error::success();
+
+  BinaryStreamReader Reader(ModInfo);
+
+  if (auto EC = Reader.readArray(Descriptors, ModInfo.getLength()))
+    return EC;
+
+  return Error::success();
+}
+
+Error DbiModuleList::initializeFileInfo(BinaryStreamRef FileInfo) {
+  FileInfoSubstream = FileInfo;
+
+  if (FileInfo.getLength() == 0)
+    return Error::success();
+
+  BinaryStreamReader FISR(FileInfo);
+  if (auto EC = FISR.readObject(FileInfoHeader))
+    return EC;
+
+  // First is an array of `NumModules` module indices.  This does not seem to be
+  // used for anything meaningful, so we ignore it.
+  FixedStreamArray<support::ulittle16_t> ModuleIndices;
+  if (auto EC = FISR.readArray(ModuleIndices, FileInfoHeader->NumModules))
+    return EC;
+  if (auto EC = FISR.readArray(ModFileCountArray, FileInfoHeader->NumModules))
+    return EC;
+
+  // Compute the real number of source files.  We can't trust the value in
+  // `FileInfoHeader->NumSourceFiles` because it is a unit16, and the sum of all
+  // source file counts might be larger than a unit16.  So we compute the real
+  // count by summing up the individual counts.
+  uint32_t NumSourceFiles = 0;
+  for (auto Count : ModFileCountArray)
+    NumSourceFiles += Count;
+
+  // In the reference implementation, this array is where the pointer documented
+  // at the definition of ModuleInfoHeader::FileNameOffs points to.  Note that
+  // although the field in ModuleInfoHeader is ignored this array is not, as it
+  // is the authority on where each filename begins in the names buffer.
+  if (auto EC = FISR.readArray(FileNameOffsets, NumSourceFiles))
+    return EC;
+
+  if (auto EC = FISR.readStreamRef(NamesBuffer))
+    return EC;
+
+  auto DescriptorIter = Descriptors.begin();
+  uint32_t NextFileIndex = 0;
+  ModuleInitialFileIndex.resize(FileInfoHeader->NumModules);
+  ModuleDescriptorOffsets.resize(FileInfoHeader->NumModules);
+  for (size_t I = 0; I < FileInfoHeader->NumModules; ++I) {
+    assert(DescriptorIter != Descriptors.end());
+    ModuleInitialFileIndex[I] = NextFileIndex;
+    ModuleDescriptorOffsets[I] = DescriptorIter.offset();
+
+    NextFileIndex += ModFileCountArray[I];
+    ++DescriptorIter;
+  }
+
+  assert(DescriptorIter == Descriptors.end());
+  assert(NextFileIndex == NumSourceFiles);
+
+  return Error::success();
+}
+
+uint32_t DbiModuleList::getModuleCount() const {
+  return FileInfoHeader->NumModules;
+}
+
+uint32_t DbiModuleList::getSourceFileCount() const {
+  return FileNameOffsets.size();
+}
+
+uint16_t DbiModuleList::getSourceFileCount(uint32_t Modi) const {
+  return ModFileCountArray[Modi];
+}
+
+DbiModuleDescriptor DbiModuleList::getModuleDescriptor(uint32_t Modi) const {
+  assert(Modi < getModuleCount());
+  uint32_t Offset = ModuleDescriptorOffsets[Modi];
+  auto Iter = Descriptors.at(Offset);
+  assert(Iter != Descriptors.end());
+  return *Iter;
+}
+
+iterator_range<DbiModuleSourceFilesIterator>
+DbiModuleList::source_files(uint32_t Modi) const {
+  return make_range<DbiModuleSourceFilesIterator>(
+      DbiModuleSourceFilesIterator(*this, Modi, 0),
+      DbiModuleSourceFilesIterator());
+}
+
+Expected<StringRef> DbiModuleList::getFileName(uint32_t Index) const {
+  BinaryStreamReader Names(NamesBuffer);
+  if (Index >= getSourceFileCount())
+    return make_error<RawError>(raw_error_code::index_out_of_bounds);
+
+  uint32_t FileOffset = FileNameOffsets[Index];
+  Names.setOffset(FileOffset);
+  StringRef Name;
+  if (auto EC = Names.readCString(Name))
+    return std::move(EC);
+  return Name;
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
index db70380..f7538c5 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -107,11 +107,11 @@
     return make_error<RawError>(raw_error_code::corrupt_file,
                                 "DBI type server substream not aligned.");
 
+  BinaryStreamRef ModInfoSubstream;
+  BinaryStreamRef FileInfoSubstream;
   if (auto EC =
           Reader.readStreamRef(ModInfoSubstream, Header->ModiSubstreamSize))
     return EC;
-  if (auto EC = initializeModInfoArray())
-    return EC;
 
   if (auto EC = Reader.readStreamRef(SecContrSubstream,
                                      Header->SecContrSubstreamSize))
@@ -129,14 +129,15 @@
           DbgStreams, Header->OptionalDbgHdrSize / sizeof(ulittle16_t)))
     return EC;
 
+  if (auto EC = Modules.initialize(ModInfoSubstream, FileInfoSubstream))
+    return EC;
+
   if (auto EC = initializeSectionContributionData())
     return EC;
   if (auto EC = initializeSectionHeadersData())
     return EC;
   if (auto EC = initializeSectionMapData())
     return EC;
-  if (auto EC = initializeFileInfo())
-    return EC;
   if (auto EC = initializeFpoRecords())
     return EC;
 
@@ -215,7 +216,8 @@
   return FpoRecords;
 }
 
-ArrayRef<ModuleInfoEx> DbiStream::modules() const { return ModuleInfos; }
+const DbiModuleList &DbiStream::modules() const { return Modules; }
+
 FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
   return SectionMap;
 }
@@ -248,25 +250,6 @@
                               "Unsupported DBI Section Contribution version");
 }
 
-Error DbiStream::initializeModInfoArray() {
-  if (ModInfoSubstream.getLength() == 0)
-    return Error::success();
-
-  // Since each DbiModuleDescriptor in the stream is a variable length, we have
-  // to iterate
-  // them to know how many there actually are.
-  BinaryStreamReader Reader(ModInfoSubstream);
-
-  VarStreamArray<DbiModuleDescriptor> ModInfoArray;
-  if (auto EC = Reader.readArray(ModInfoArray, ModInfoSubstream.getLength()))
-    return EC;
-  for (auto &Info : ModInfoArray) {
-    ModuleInfos.emplace_back(Info);
-  }
-
-  return Error::success();
-}
-
 // Initializes this->SectionHeaders.
 Error DbiStream::initializeSectionHeadersData() {
   if (DbgStreams.size() == 0)
@@ -338,90 +321,9 @@
   return Error::success();
 }
 
-Error DbiStream::initializeFileInfo() {
-  if (FileInfoSubstream.getLength() == 0)
-    return Error::success();
-
-  const FileInfoSubstreamHeader *FH;
-  BinaryStreamReader FISR(FileInfoSubstream);
-  if (auto EC = FISR.readObject(FH))
-    return EC;
-
-  // The number of modules in the stream should be the same as reported by
-  // the FileInfoSubstreamHeader.
-  if (FH->NumModules != ModuleInfos.size())
-    return make_error<RawError>(raw_error_code::corrupt_file,
-                                "FileInfo substream count doesn't match DBI.");
-
-  FixedStreamArray<ulittle16_t> ModIndexArray;
-  FixedStreamArray<ulittle16_t> ModFileCountArray;
-
-  // First is an array of `NumModules` module indices.  This is not used for the
-  // same reason that `NumSourceFiles` is not used.  It's an array of uint16's,
-  // but it's possible there are more than 64k source files, which would imply
-  // more than 64k modules (e.g. object files) as well.  So we ignore this
-  // field.
-  if (auto EC = FISR.readArray(ModIndexArray, ModuleInfos.size()))
-    return EC;
-  if (auto EC = FISR.readArray(ModFileCountArray, ModuleInfos.size()))
-    return EC;
-
-  // Compute the real number of source files.
-  uint32_t NumSourceFiles = 0;
-  for (auto Count : ModFileCountArray)
-    NumSourceFiles += Count;
-
-  // This is the array that in the reference implementation corresponds to
-  // `DbiModuleDescriptor::FileLayout::FileNameOffs`, which is commented there
-  // as being a
-  // pointer. Due to the mentioned problems of pointers causing difficulty
-  // when reading from the file on 64-bit systems, we continue to ignore that
-  // field in `DbiModuleDescriptor`, and instead build a vector of StringRefs
-  // and stores
-  // them in `ModuleInfoEx`.  The value written to and read from the file is
-  // not used anyway, it is only there as a way to store the offsets for the
-  // purposes of later accessing the names at runtime.
-  if (auto EC = FISR.readArray(FileNameOffsets, NumSourceFiles))
-    return EC;
-
-  if (auto EC = FISR.readStreamRef(NamesBuffer))
-    return EC;
-
-  // We go through each ModuleInfo, determine the number N of source files for
-  // that module, and then get the next N offsets from the Offsets array, using
-  // them to get the corresponding N names from the Names buffer and associating
-  // each one with the corresponding module.
-  uint32_t NextFileIndex = 0;
-  for (size_t I = 0; I < ModuleInfos.size(); ++I) {
-    uint32_t NumFiles = ModFileCountArray[I];
-    ModuleInfos[I].SourceFiles.resize(NumFiles);
-    for (size_t J = 0; J < NumFiles; ++J, ++NextFileIndex) {
-      auto ThisName = getFileNameForIndex(NextFileIndex);
-      if (!ThisName)
-        return ThisName.takeError();
-      ModuleInfos[I].SourceFiles[J] = *ThisName;
-    }
-  }
-
-  return Error::success();
-}
-
 uint32_t DbiStream::getDebugStreamIndex(DbgHeaderType Type) const {
   uint16_t T = static_cast<uint16_t>(Type);
   if (T >= DbgStreams.size())
     return kInvalidStreamIndex;
   return DbgStreams[T];
 }
-
-Expected<StringRef> DbiStream::getFileNameForIndex(uint32_t Index) const {
-  BinaryStreamReader Names(NamesBuffer);
-  if (Index >= FileNameOffsets.size())
-    return make_error<RawError>(raw_error_code::index_out_of_bounds);
-
-  uint32_t FileOffset = FileNameOffsets[Index];
-  Names.setOffset(FileOffset);
-  StringRef Name;
-  if (auto EC = Names.readCString(Name))
-    return std::move(EC);
-  return Name;
-}
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
index 9c0cc0b..77f8325 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -13,7 +13,7 @@
 namespace pdb {
 
 NativeCompilandSymbol::NativeCompilandSymbol(NativeSession &Session,
-                                             const ModuleInfoEx &MI)
+                                             DbiModuleDescriptor MI)
     : NativeRawSymbol(Session), Module(MI) {}
 
 PDB_SymType NativeCompilandSymbol::getSymTag() const {
@@ -21,7 +21,7 @@
 }
 
 bool NativeCompilandSymbol::isEditAndContinueEnabled() const {
-  return Module.Info.hasECInfo();
+  return Module.hasECInfo();
 }
 
 uint32_t NativeCompilandSymbol::getLexicalParentId() const { return 0; }
@@ -32,11 +32,11 @@
 // this potential confusion.
 
 std::string NativeCompilandSymbol::getLibraryName() const {
-  return Module.Info.getObjFileName();
+  return Module.getObjFileName();
 }
 
 std::string NativeCompilandSymbol::getName() const {
-  return Module.Info.getModuleName();
+  return Module.getModuleName();
 }
 
 } // namespace pdb
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
index 7532110..97319fd 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
@@ -10,6 +10,7 @@
 #include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
 
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
 #include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
@@ -19,25 +20,25 @@
 namespace pdb {
 
 NativeEnumModules::NativeEnumModules(NativeSession &PDBSession,
-                                     ArrayRef<ModuleInfoEx> Modules,
+                                     const DbiModuleList &Modules,
                                      uint32_t Index)
     : Session(PDBSession), Modules(Modules), Index(Index) {}
 
 uint32_t NativeEnumModules::getChildCount() const {
-  return static_cast<uint32_t>(Modules.size());
+  return static_cast<uint32_t>(Modules.getModuleCount());
 }
 
 std::unique_ptr<PDBSymbol>
 NativeEnumModules::getChildAtIndex(uint32_t Index) const {
-  if (Index >= Modules.size())
+  if (Index >= Modules.getModuleCount())
     return nullptr;
-  return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland(Session,
-      std::unique_ptr<IPDBRawSymbol>(
-          new NativeCompilandSymbol(Session, Modules[Index]))));
+  return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland(
+      Session, std::unique_ptr<IPDBRawSymbol>(new NativeCompilandSymbol(
+                   Session, Modules.getModuleDescriptor(Index)))));
 }
 
 std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() {
-  if (Index >= Modules.size())
+  if (Index >= Modules.getModuleCount())
     return nullptr;
   return getChildAtIndex(Index++);
 }
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
index ec2a4b8..bb52560 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -26,7 +26,7 @@
   case PDB_SymType::Compiland: {
     auto Dbi = File.getPDBDbiStream();
     if (Dbi) {
-      const auto Modules = Dbi->modules();
+      const DbiModuleList &Modules = Dbi->modules();
       return std::unique_ptr<IPDBEnumSymbols>(
           new NativeEnumModules(Session, Modules));
     }