[PDB] Don't build the entire source file list up front.

I tried to run llvm-pdbdump on a very large (~1.5GB) PDB to
try and identify show-stopping performance problems.  This
patch addresses the first such problem.

When loading the DBI stream, before anyone has even tried to
access a single record, we build an in-memory map of every
source file for every module.  In the particular PDB I was
using, this was over 85 million files.  Specifically, the
complexity is O(m*n) where m is the number of modules and
n is the average number of source files (including headers)
per module.

The whole reason for doing this was so that we could have
constant-time access to any module and any of its source
file lists.  However, we can still get O(1) access to the
source file list for a given module with a simple O(m)
precomputation, and access to the list of modules is
already O(1) anyway.

So this patch reduces the O(m*n) up-front precomputation
to an O(m) one, where n is ~6,500 and n*m is about 85 million
in my pathological test case.

Differential Revision: https://reviews.llvm.org/D32870

llvm-svn: 302205
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
index db70380..f7538c5 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -107,11 +107,11 @@
     return make_error<RawError>(raw_error_code::corrupt_file,
                                 "DBI type server substream not aligned.");
 
+  BinaryStreamRef ModInfoSubstream;
+  BinaryStreamRef FileInfoSubstream;
   if (auto EC =
           Reader.readStreamRef(ModInfoSubstream, Header->ModiSubstreamSize))
     return EC;
-  if (auto EC = initializeModInfoArray())
-    return EC;
 
   if (auto EC = Reader.readStreamRef(SecContrSubstream,
                                      Header->SecContrSubstreamSize))
@@ -129,14 +129,15 @@
           DbgStreams, Header->OptionalDbgHdrSize / sizeof(ulittle16_t)))
     return EC;
 
+  if (auto EC = Modules.initialize(ModInfoSubstream, FileInfoSubstream))
+    return EC;
+
   if (auto EC = initializeSectionContributionData())
     return EC;
   if (auto EC = initializeSectionHeadersData())
     return EC;
   if (auto EC = initializeSectionMapData())
     return EC;
-  if (auto EC = initializeFileInfo())
-    return EC;
   if (auto EC = initializeFpoRecords())
     return EC;
 
@@ -215,7 +216,8 @@
   return FpoRecords;
 }
 
-ArrayRef<ModuleInfoEx> DbiStream::modules() const { return ModuleInfos; }
+const DbiModuleList &DbiStream::modules() const { return Modules; }
+
 FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
   return SectionMap;
 }
@@ -248,25 +250,6 @@
                               "Unsupported DBI Section Contribution version");
 }
 
-Error DbiStream::initializeModInfoArray() {
-  if (ModInfoSubstream.getLength() == 0)
-    return Error::success();
-
-  // Since each DbiModuleDescriptor in the stream is a variable length, we have
-  // to iterate
-  // them to know how many there actually are.
-  BinaryStreamReader Reader(ModInfoSubstream);
-
-  VarStreamArray<DbiModuleDescriptor> ModInfoArray;
-  if (auto EC = Reader.readArray(ModInfoArray, ModInfoSubstream.getLength()))
-    return EC;
-  for (auto &Info : ModInfoArray) {
-    ModuleInfos.emplace_back(Info);
-  }
-
-  return Error::success();
-}
-
 // Initializes this->SectionHeaders.
 Error DbiStream::initializeSectionHeadersData() {
   if (DbgStreams.size() == 0)
@@ -338,90 +321,9 @@
   return Error::success();
 }
 
-Error DbiStream::initializeFileInfo() {
-  if (FileInfoSubstream.getLength() == 0)
-    return Error::success();
-
-  const FileInfoSubstreamHeader *FH;
-  BinaryStreamReader FISR(FileInfoSubstream);
-  if (auto EC = FISR.readObject(FH))
-    return EC;
-
-  // The number of modules in the stream should be the same as reported by
-  // the FileInfoSubstreamHeader.
-  if (FH->NumModules != ModuleInfos.size())
-    return make_error<RawError>(raw_error_code::corrupt_file,
-                                "FileInfo substream count doesn't match DBI.");
-
-  FixedStreamArray<ulittle16_t> ModIndexArray;
-  FixedStreamArray<ulittle16_t> ModFileCountArray;
-
-  // First is an array of `NumModules` module indices.  This is not used for the
-  // same reason that `NumSourceFiles` is not used.  It's an array of uint16's,
-  // but it's possible there are more than 64k source files, which would imply
-  // more than 64k modules (e.g. object files) as well.  So we ignore this
-  // field.
-  if (auto EC = FISR.readArray(ModIndexArray, ModuleInfos.size()))
-    return EC;
-  if (auto EC = FISR.readArray(ModFileCountArray, ModuleInfos.size()))
-    return EC;
-
-  // Compute the real number of source files.
-  uint32_t NumSourceFiles = 0;
-  for (auto Count : ModFileCountArray)
-    NumSourceFiles += Count;
-
-  // This is the array that in the reference implementation corresponds to
-  // `DbiModuleDescriptor::FileLayout::FileNameOffs`, which is commented there
-  // as being a
-  // pointer. Due to the mentioned problems of pointers causing difficulty
-  // when reading from the file on 64-bit systems, we continue to ignore that
-  // field in `DbiModuleDescriptor`, and instead build a vector of StringRefs
-  // and stores
-  // them in `ModuleInfoEx`.  The value written to and read from the file is
-  // not used anyway, it is only there as a way to store the offsets for the
-  // purposes of later accessing the names at runtime.
-  if (auto EC = FISR.readArray(FileNameOffsets, NumSourceFiles))
-    return EC;
-
-  if (auto EC = FISR.readStreamRef(NamesBuffer))
-    return EC;
-
-  // We go through each ModuleInfo, determine the number N of source files for
-  // that module, and then get the next N offsets from the Offsets array, using
-  // them to get the corresponding N names from the Names buffer and associating
-  // each one with the corresponding module.
-  uint32_t NextFileIndex = 0;
-  for (size_t I = 0; I < ModuleInfos.size(); ++I) {
-    uint32_t NumFiles = ModFileCountArray[I];
-    ModuleInfos[I].SourceFiles.resize(NumFiles);
-    for (size_t J = 0; J < NumFiles; ++J, ++NextFileIndex) {
-      auto ThisName = getFileNameForIndex(NextFileIndex);
-      if (!ThisName)
-        return ThisName.takeError();
-      ModuleInfos[I].SourceFiles[J] = *ThisName;
-    }
-  }
-
-  return Error::success();
-}
-
 uint32_t DbiStream::getDebugStreamIndex(DbgHeaderType Type) const {
   uint16_t T = static_cast<uint16_t>(Type);
   if (T >= DbgStreams.size())
     return kInvalidStreamIndex;
   return DbgStreams[T];
 }
-
-Expected<StringRef> DbiStream::getFileNameForIndex(uint32_t Index) const {
-  BinaryStreamReader Names(NamesBuffer);
-  if (Index >= FileNameOffsets.size())
-    return make_error<RawError>(raw_error_code::index_out_of_bounds);
-
-  uint32_t FileOffset = FileNameOffsets[Index];
-  Names.setOffset(FileOffset);
-  StringRef Name;
-  if (auto EC = Names.readCString(Name))
-    return std::move(EC);
-  return Name;
-}