[Support] Make line-number cache robust against access patterns.

Summary:
The LLVM SourceMgr class (which is used indirectly by Swift, though not Clang)
has a routine for looking up line numbers of SMLocs. This routine uses a
shared, special-purpose cache that handles exactly one access pattern
efficiently: looking up the line number of an SMLoc that points into the same
buffer as the last query made to the SourceMgr, at a location in the buffer at
or ahead of the last query.

When this works it's fine, but when it fails it's catastrophic for performancer:
one recent out-of-order access from a Swift utility routine ran for tens of
seconds, spending 99% of its time repeatedly scanning buffers for '\n'.

This change removes the shared cache from the SourceMgr and installs a new
cache in each SrcBuffer. The per-SrcBuffer caches are also "full", in the sense
that rather than caching a single last-query pointer, they cache _all_ the
line-ending offsets, in a binary-searchable array, such that once it's
populated (on first access), all subsequent access patterns run at the same
speed.

Performance measurements I've done show this is actually a little bit faster on
real codebases (though only a couple fractions of a percent). Memory usage is
up by a few tens to hundreds of bytes per SrcBuffer that has a line lookup done
on it; I've attempted to minimize this by using dynamic selection of integer
sized when storing offset arrays. But the main motive here is to
make-impossible the cases we don't always see, that show up by surprise when
there is an out-of-order access pattern.

Reviewers: jordan_rose

Reviewed By: jordan_rose

Subscribers: probinson, llvm-commits

Differential Revision: https://reviews.llvm.org/D45003

llvm-svn: 329470
diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp
index a8f6208..089fcdb 100644
--- a/llvm/lib/Support/SourceMgr.cpp
+++ b/llvm/lib/Support/SourceMgr.cpp
@@ -28,6 +28,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <limits>
 #include <memory>
 #include <string>
 #include <utility>
@@ -36,24 +37,6 @@
 
 static const size_t TabStop = 8;
 
-namespace {
-
-  struct LineNoCacheTy {
-    const char *LastQuery;
-    unsigned LastQueryBufferID;
-    unsigned LineNoOfQuery;
-  };
-
-} // end anonymous namespace
-
-static LineNoCacheTy *getCache(void *Ptr) {
-  return (LineNoCacheTy*)Ptr;
-}
-
-SourceMgr::~SourceMgr() {
-  delete getCache(LineNoCache);
-}
-
 unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
                                    SMLoc IncludeLoc,
                                    std::string &IncludedFile) {
@@ -85,46 +68,86 @@
   return 0;
 }
 
+template <typename T>
+unsigned SourceMgr::SrcBuffer::getLineNumber(const char *Ptr) const {
+
+  // Ensure OffsetCache is allocated and populated with offsets of all the
+  // '\n' bytes.
+  std::vector<T> *Offsets = nullptr;
+  if (OffsetCache.isNull()) {
+    Offsets = new std::vector<T>();
+    OffsetCache = Offsets;
+    size_t Sz = Buffer->getBufferSize();
+    assert(Sz <= std::numeric_limits<T>::max());
+    StringRef S = Buffer->getBuffer();
+    for (size_t N = 0; N < Sz; ++N) {
+      if (S[N] == '\n') {
+        Offsets->push_back(static_cast<T>(N));
+      }
+    }
+  } else {
+    Offsets = OffsetCache.get<std::vector<T> *>();
+  }
+
+  const char *BufStart = Buffer->getBufferStart();
+  assert(Ptr >= BufStart && Ptr <= Buffer->getBufferEnd());
+  ptrdiff_t PtrDiff = Ptr - BufStart;
+  assert(PtrDiff >= 0 && static_cast<size_t>(PtrDiff) <= std::numeric_limits<T>::max());
+  T PtrOffset = static_cast<T>(PtrDiff);
+
+  // std::lower_bound returns the first EOL offset that's not-less-than
+  // PtrOffset, meaning the EOL that _ends the line_ that PtrOffset is on
+  // (including if PtrOffset refers to the EOL itself). If there's no such
+  // EOL, returns end().
+  auto EOL = std::lower_bound(Offsets->begin(), Offsets->end(), PtrOffset);
+
+  // Lines count from 1, so add 1 to the distance from the 0th line.
+  return (1 + (EOL - Offsets->begin()));
+}
+
+SourceMgr::SrcBuffer::SrcBuffer(SourceMgr::SrcBuffer &&Other)
+  : Buffer(std::move(Other.Buffer)),
+    OffsetCache(Other.OffsetCache),
+    IncludeLoc(Other.IncludeLoc) {
+  Other.OffsetCache = nullptr;
+}
+
+SourceMgr::SrcBuffer::~SrcBuffer() {
+  if (!OffsetCache.isNull()) {
+    if (OffsetCache.is<std::vector<uint8_t>*>())
+      delete OffsetCache.get<std::vector<uint8_t>*>();
+    else if (OffsetCache.is<std::vector<uint16_t>*>())
+      delete OffsetCache.get<std::vector<uint16_t>*>();
+    else if (OffsetCache.is<std::vector<uint32_t>*>())
+      delete OffsetCache.get<std::vector<uint32_t>*>();
+    else
+      delete OffsetCache.get<std::vector<uint64_t>*>();
+    OffsetCache = nullptr;
+  }
+}
+
 std::pair<unsigned, unsigned>
 SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
   if (!BufferID)
     BufferID = FindBufferContainingLoc(Loc);
   assert(BufferID && "Invalid Location!");
 
-  const MemoryBuffer *Buff = getMemoryBuffer(BufferID);
+  auto &SB = getBufferInfo(BufferID);
+  const char *Ptr = Loc.getPointer();
 
-  // Count the number of \n's between the start of the file and the specified
-  // location.
-  unsigned LineNo = 1;
+  size_t Sz = SB.Buffer->getBufferSize();
+  assert(Sz <= std::numeric_limits<uint64_t>::max());
+  unsigned LineNo;
+  if (Sz <= std::numeric_limits<uint8_t>::max())
+    LineNo = SB.getLineNumber<uint8_t>(Ptr);
+  else if (Sz <= std::numeric_limits<uint16_t>::max())
+    LineNo = SB.getLineNumber<uint16_t>(Ptr);
+  else if (Sz <= std::numeric_limits<uint32_t>::max())
+    LineNo = SB.getLineNumber<uint32_t>(Ptr);
+  else
+    LineNo = SB.getLineNumber<uint64_t>(Ptr);
 
-  const char *BufStart = Buff->getBufferStart();
-  const char *Ptr = BufStart;
-
-  // If we have a line number cache, and if the query is to a later point in the
-  // same file, start searching from the last query location.  This optimizes
-  // for the case when multiple diagnostics come out of one file in order.
-  if (LineNoCacheTy *Cache = getCache(LineNoCache))
-    if (Cache->LastQueryBufferID == BufferID &&
-        Cache->LastQuery <= Loc.getPointer()) {
-      Ptr = Cache->LastQuery;
-      LineNo = Cache->LineNoOfQuery;
-    }
-
-  // Scan for the location being queried, keeping track of the number of lines
-  // we see.
-  for (; SMLoc::getFromPointer(Ptr) != Loc; ++Ptr)
-    if (*Ptr == '\n') ++LineNo;
-
-  // Allocate the line number cache if it doesn't exist.
-  if (!LineNoCache)
-    LineNoCache = new LineNoCacheTy();
-
-  // Update the line # cache.
-  LineNoCacheTy &Cache = *getCache(LineNoCache);
-  Cache.LastQueryBufferID = BufferID;
-  Cache.LastQuery = Ptr;
-  Cache.LineNoOfQuery = LineNo;
-  
+  const char *BufStart = SB.Buffer->getBufferStart();
   size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r");
   if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0;
   return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs);