Restructure how we break tokens.

This fixes some bugs in the reflowing logic and splits out the concerns
of reflowing from BreakableToken.

Things to do after this patch:
- Refactor the breakProtrudingToken function possibly into a class, so we
  can split it up into methods that operate on the common state.
- Optimize whitespace compression when reflowing by using the next possible
  split point instead of the latest possible split point.
- Retry different strategies for reflowing (strictly staying below the
  column limit vs. allowing excess characters if possible).

Differential Revision: https://reviews.llvm.org/D40310

llvm-svn: 319314
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 44e6ad4..c0d2bf6 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -1484,23 +1484,25 @@
                                                     LineState &State,
                                                     bool AllowBreak,
                                                     bool DryRun) {
-  std::unique_ptr<BreakableToken> Token =
+  std::unique_ptr<const BreakableToken> Token =
       createBreakableToken(Current, State, AllowBreak);
   if (!Token)
     return 0;
+  assert(Token->getLineCount() > 0);
   unsigned ColumnLimit = getColumnLimit(State);
-  unsigned StartColumn = State.Column - Current.ColumnWidth;
   if (Current.is(TT_LineComment)) {
     // We don't insert backslashes when breaking line comments.
     ColumnLimit = Style.ColumnLimit;
   }
   if (Current.UnbreakableTailLength >= ColumnLimit)
     return 0;
-
+  // ColumnWidth was already accounted into State.Column before calling
+  // breakProtrudingToken.
+  unsigned StartColumn = State.Column - Current.ColumnWidth;
   unsigned NewBreakPenalty = Current.isStringLiteral()
                                  ? Style.PenaltyBreakString
                                  : Style.PenaltyBreakComment;
-  unsigned RemainingSpace = ColumnLimit - Current.UnbreakableTailLength;
+  // Stores whether we introduce a break anywhere in the token.
   bool BreakInserted = Token->introducesBreakBeforeToken();
   // Store whether we inserted a new line break at the end of the previous
   // logical line.
@@ -1508,145 +1510,295 @@
   // We use a conservative reflowing strategy. Reflow starts after a line is
   // broken or the corresponding whitespace compressed. Reflow ends as soon as a
   // line that doesn't get reflown with the previous line is reached.
-  bool ReflowInProgress = false;
-  unsigned Penalty = 0;
-  unsigned RemainingTokenColumns = 0;
+  bool Reflow = false;
+  // Keep track of where we are in the token:
+  // Where we are in the content of the current logical line.
   unsigned TailOffset = 0;
+  // The column number we're currently at.
+  unsigned ContentStartColumn =
+      Token->getContentStartColumn(0, /*Break=*/false);
+  // The number of columns left in the current logical line after TailOffset.
+  unsigned RemainingTokenColumns =
+      Token->getRemainingLength(0, TailOffset, ContentStartColumn);
+  // Adapt the start of the token, for example indent.
+  if (!DryRun)
+    Token->adaptStartOfLine(0, Whitespaces);
+
+  unsigned Penalty = 0;
   DEBUG(llvm::dbgs() << "Breaking protruding token at column " << StartColumn
                      << ".\n");
   for (unsigned LineIndex = 0, EndIndex = Token->getLineCount();
        LineIndex != EndIndex; ++LineIndex) {
-    DEBUG(llvm::dbgs() << "  Line: " << LineIndex
-                       << " (Reflow: " << ReflowInProgress << ")\n");
-    BreakableToken::Split SplitBefore(StringRef::npos, 0);
-    if (ReflowInProgress) {
-      SplitBefore = Token->getSplitBefore(LineIndex, RemainingTokenColumns,
-                                          RemainingSpace, CommentPragmasRegex);
-    }
-    ReflowInProgress = SplitBefore.first != StringRef::npos;
-    DEBUG({
-      if (ReflowInProgress)
-        llvm::dbgs() << "  Reflowing.\n";
-    });
-    TailOffset =
-        ReflowInProgress ? (SplitBefore.first + SplitBefore.second) : 0;
-    // If we found a reflow split and have added a new break before this line,
-    // we are going to remove the line break at the start of the next logical
-    // line.
-    // For example, here we'll add a new line break after 'text', and
-    // subsequently delete the line break between 'that' and 'reflows'.
-    //   // some text that
-    //   // reflows
-    // ->
-    //   // some text
-    //   // that reflows
-    // When adding the line break, we also added the penalty for it, so we need
-    // to subtract that penalty again when we remove the line break due to
-    // reflowing.
-    if (ReflowInProgress && NewBreakBefore) {
-      assert(Penalty >= NewBreakPenalty);
-      Penalty -= NewBreakPenalty;
-    }
+    DEBUG(llvm::dbgs() << "  Line: " << LineIndex << " (Reflow: " << Reflow
+                       << ")\n");
     NewBreakBefore = false;
-    if (!DryRun)
-      Token->replaceWhitespaceBefore(LineIndex, RemainingTokenColumns,
-                                     RemainingSpace, SplitBefore, Whitespaces);
-    RemainingTokenColumns = Token->getLineLengthAfterSplitBefore(
-        LineIndex, TailOffset, RemainingTokenColumns, ColumnLimit, SplitBefore);
-    while (RemainingTokenColumns > RemainingSpace) {
-      DEBUG(llvm::dbgs() << "    Over limit, need: " << RemainingTokenColumns
-                         << ", space: " << RemainingSpace << "\n");
-      BreakableToken::Split Split = Token->getSplit(
-          LineIndex, TailOffset, ColumnLimit, CommentPragmasRegex);
+    // If we did reflow the previous line, we'll try reflowing again. Otherwise
+    // we'll start reflowing if the current line is broken or whitespace is
+    // compressed.
+    bool TryReflow = Reflow;
+    // Break the current token until we can fit the rest of the line.
+    while (ContentStartColumn + RemainingTokenColumns > ColumnLimit) {
+      DEBUG(llvm::dbgs() << "    Over limit, need: "
+                         << (ContentStartColumn + RemainingTokenColumns)
+                         << ", space: " << ColumnLimit
+                         << ", reflown prefix: " << ContentStartColumn
+                         << ", offset in line: " << TailOffset << "\n");
+      // If the current token doesn't fit, find the latest possible split in the
+      // current line so that breaking at it will be under the column limit.
+      // FIXME: Use the earliest possible split while reflowing to correctly
+      // compress whitespace within a line.
+      BreakableToken::Split Split =
+          Token->getSplit(LineIndex, TailOffset, ColumnLimit,
+                          ContentStartColumn, CommentPragmasRegex);
       if (Split.first == StringRef::npos) {
-        // The last line's penalty is handled in addNextStateToQueue().
+        // No break opportunity - update the penalty and continue with the next
+        // logical line.
         if (LineIndex < EndIndex - 1)
+          // The last line's penalty is handled in addNextStateToQueue().
           Penalty += Style.PenaltyExcessCharacter *
-                     (RemainingTokenColumns - RemainingSpace);
+                     (ContentStartColumn + RemainingTokenColumns - ColumnLimit);
         DEBUG(llvm::dbgs() << "    No break opportunity.\n");
         break;
       }
       assert(Split.first != 0);
 
-      // Check if compressing the whitespace range will bring the line length
-      // under the limit. If that is the case, we perform whitespace compression
-      // instead of inserting a line break.
-      unsigned RemainingTokenColumnsAfterCompression =
-          Token->getLineLengthAfterCompression(RemainingTokenColumns, Split);
-      if (RemainingTokenColumnsAfterCompression <= RemainingSpace) {
-        RemainingTokenColumns = RemainingTokenColumnsAfterCompression;
-        ReflowInProgress = true;
-        if (!DryRun)
-          Token->compressWhitespace(LineIndex, TailOffset, Split, Whitespaces);
-        DEBUG(llvm::dbgs() << "    Compressing below limit.\n");
-        break;
+      if (Token->supportsReflow()) {
+        // Check whether the next natural split point after the current one can
+        // still fit the line, either because we can compress away whitespace,
+        // or because the penalty the excess characters introduce is lower than
+        // the break penalty.
+        // We only do this for tokens that support reflowing, and thus allow us
+        // to change the whitespace arbitrarily (e.g. comments).
+        // Other tokens, like string literals, can be broken on arbitrary
+        // positions.
+
+        // First, compute the columns from TailOffset to the next possible split
+        // position.
+        // For example:
+        // ColumnLimit:     |
+        // // Some text   that    breaks
+        //    ^ tail offset
+        //             ^-- split
+        //    ^-------- to split columns
+        //                    ^--- next split
+        //    ^--------------- to next split columns
+        unsigned ToSplitColumns = Token->getRangeLength(
+            LineIndex, TailOffset, Split.first, ContentStartColumn);
+        DEBUG(llvm::dbgs() << "    ToSplit: " << ToSplitColumns << "\n");
+
+        BreakableToken::Split NextSplit = Token->getSplit(
+            LineIndex, TailOffset + Split.first + Split.second, ColumnLimit,
+            ContentStartColumn + ToSplitColumns + 1, CommentPragmasRegex);
+        // Compute the columns necessary to fit the next non-breakable sequence
+        // into the current line.
+        unsigned ToNextSplitColumns = 0;
+        if (NextSplit.first == StringRef::npos) {
+          ToNextSplitColumns = Token->getRemainingLength(LineIndex, TailOffset,
+                                                         ContentStartColumn);
+        } else {
+          ToNextSplitColumns = Token->getRangeLength(
+              LineIndex, TailOffset,
+              Split.first + Split.second + NextSplit.first, ContentStartColumn);
+        }
+        // Compress the whitespace between the break and the start of the next
+        // unbreakable sequence.
+        ToNextSplitColumns =
+            Token->getLengthAfterCompression(ToNextSplitColumns, Split);
+        DEBUG(llvm::dbgs() << "    ContentStartColumn: " << ContentStartColumn
+                           << "\n");
+        DEBUG(llvm::dbgs() << "    ToNextSplit: " << ToNextSplitColumns << "\n");
+        // If the whitespace compression makes us fit, continue on the current
+        // line.
+        bool ContinueOnLine =
+            ContentStartColumn + ToNextSplitColumns <= ColumnLimit;
+        unsigned ExcessCharactersPenalty = 0;
+        if (!ContinueOnLine) {
+          // Similarly, if the excess characters' penalty is lower than the
+          // penalty of introducing a new break, continue on the current line.
+          ExcessCharactersPenalty =
+              (ContentStartColumn + ToNextSplitColumns - ColumnLimit) *
+              Style.PenaltyExcessCharacter;
+          DEBUG(llvm::dbgs()
+                << "    Penalty excess: " << ExcessCharactersPenalty
+                << "\n            break : " << NewBreakPenalty << "\n");
+          if (ExcessCharactersPenalty < NewBreakPenalty)
+            ContinueOnLine = true;
+        }
+        if (ContinueOnLine) {
+          DEBUG(llvm::dbgs() << "    Continuing on line...\n");
+          // The current line fits after compressing the whitespace - reflow
+          // the next line into it if possible.
+          TryReflow = true;
+          if (!DryRun)
+            Token->compressWhitespace(LineIndex, TailOffset, Split,
+                                      Whitespaces);
+          // When we continue on the same line, leave one space between content.
+          ContentStartColumn += ToSplitColumns + 1;
+          Penalty += ExcessCharactersPenalty;
+          TailOffset += Split.first + Split.second;
+          RemainingTokenColumns = Token->getRemainingLength(
+              LineIndex, TailOffset, ContentStartColumn);
+          continue;
+        }
       }
-
-      // Compute both the penalties for:
-      // - not breaking, and leaving excess characters
-      // - adding a new line break
-      assert(RemainingTokenColumnsAfterCompression > RemainingSpace);
-      unsigned ExcessCharactersPenalty =
-          (RemainingTokenColumnsAfterCompression - RemainingSpace) *
-          Style.PenaltyExcessCharacter;
-
-      unsigned BreakPenalty = NewBreakPenalty;
-      unsigned ColumnsUsed =
-          Token->getLineLengthAfterSplit(LineIndex, TailOffset, Split.first);
-      if (ColumnsUsed > ColumnLimit)
-        BreakPenalty +=
-            Style.PenaltyExcessCharacter * (ColumnsUsed - ColumnLimit);
-
-      DEBUG(llvm::dbgs() << "    Penalty excess: " << ExcessCharactersPenalty
-                         << "\n            break : " << BreakPenalty << "\n");
-      // Only continue to add the line break if the penalty of the excess
-      // characters is larger than the penalty of the line break.
-      // FIXME: This does not take into account when we can later remove the
-      // line break again due to a reflow.
-      if (ExcessCharactersPenalty < BreakPenalty) {
-        if (!DryRun)
-          Token->compressWhitespace(LineIndex, TailOffset, Split, Whitespaces);
-        // Do not set ReflowInProgress: we do not have any space left to
-        // reflow into.
-        Penalty += ExcessCharactersPenalty;
-        break;
-      }
-
-      unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit(
-          LineIndex, TailOffset + Split.first + Split.second, StringRef::npos);
+      DEBUG(llvm::dbgs() << "    Breaking...\n");
+      ContentStartColumn =
+          Token->getContentStartColumn(LineIndex, /*Break=*/true);
+      unsigned NewRemainingTokenColumns = Token->getRemainingLength(
+          LineIndex, TailOffset + Split.first + Split.second,
+          ContentStartColumn);
 
       // When breaking before a tab character, it may be moved by a few columns,
       // but will still be expanded to the next tab stop, so we don't save any
       // columns.
-      if (NewRemainingTokenColumns == RemainingTokenColumns)
+      if (NewRemainingTokenColumns == RemainingTokenColumns) {
         // FIXME: Do we need to adjust the penalty?
         break;
+      }
       assert(NewRemainingTokenColumns < RemainingTokenColumns);
 
+      DEBUG(llvm::dbgs() << "    Breaking at: " << TailOffset + Split.first
+                         << ", " << Split.second << "\n");
       if (!DryRun)
         Token->insertBreak(LineIndex, TailOffset, Split, Whitespaces);
 
-      Penalty += BreakPenalty;
+      Penalty += NewBreakPenalty;
       TailOffset += Split.first + Split.second;
       RemainingTokenColumns = NewRemainingTokenColumns;
-      ReflowInProgress = true;
       BreakInserted = true;
       NewBreakBefore = true;
     }
+    // In case there's another line, prepare the state for the start of the next
+    // line.
+    if (LineIndex + 1 != EndIndex) {
+      unsigned NextLineIndex = LineIndex + 1;
+      if (NewBreakBefore)
+        // After breaking a line, try to reflow the next line into the current
+        // one once RemainingTokenColumns fits.
+        TryReflow = true;
+      if (TryReflow) {
+        // We decided that we want to try reflowing the next line into the
+        // current one.
+        // We will now adjust the state as if the reflow is successful (in
+        // preparation for the next line), and see whether that works. If we
+        // decide that we cannot reflow, we will later reset the state to the
+        // start of the next line.
+        Reflow = false;
+        // As we did not continue breaking the line, RemainingTokenColumns is
+        // known to fit after ContentStartColumn. Adapt ContentStartColumn to
+        // the position at which we want to format the next line if we do
+        // actually reflow.
+        // When we reflow, we need to add a space between the end of the current
+        // line and the next line's start column.
+        ContentStartColumn += RemainingTokenColumns + 1;
+        // Get the split that we need to reflow next logical line into the end
+        // of the current one; the split will include any leading whitespace of
+        // the next logical line.
+        BreakableToken::Split SplitBeforeNext =
+            Token->getReflowSplit(NextLineIndex, CommentPragmasRegex);
+        DEBUG(llvm::dbgs() << "    Size of reflown text: " << ContentStartColumn
+                           << "\n    Potential reflow split: ");
+        if (SplitBeforeNext.first != StringRef::npos) {
+          DEBUG(llvm::dbgs() << SplitBeforeNext.first << ", "
+                             << SplitBeforeNext.second << "\n");
+          TailOffset = SplitBeforeNext.first + SplitBeforeNext.second;
+          // If the rest of the next line fits into the current line below the
+          // column limit, we can safely reflow.
+          RemainingTokenColumns = Token->getRemainingLength(
+              NextLineIndex, TailOffset, ContentStartColumn);
+          Reflow = true;
+          if (ContentStartColumn + RemainingTokenColumns > ColumnLimit) {
+            DEBUG(llvm::dbgs() << "    Over limit after reflow, need: "
+                               << (ContentStartColumn + RemainingTokenColumns)
+                               << ", space: " << ColumnLimit
+                               << ", reflown prefix: " << ContentStartColumn
+                               << ", offset in line: " << TailOffset << "\n");
+            // If the whole next line does not fit, try to find a point in
+            // the next line at which we can break so that attaching the part
+            // of the next line to that break point onto the current line is
+            // below the column limit.
+            BreakableToken::Split Split =
+                Token->getSplit(NextLineIndex, TailOffset, ColumnLimit,
+                                ContentStartColumn, CommentPragmasRegex);
+            if (Split.first == StringRef::npos) {
+              DEBUG(llvm::dbgs() << "    Did not find later break\n");
+              Reflow = false;
+            } else {
+              // Check whether the first split point gets us below the column
+              // limit. Note that we will execute this split below as part of
+              // the normal token breaking and reflow logic within the line.
+              unsigned ToSplitColumns = Token->getRangeLength(
+                  NextLineIndex, TailOffset, Split.first, ContentStartColumn);
+              if (ContentStartColumn + ToSplitColumns > ColumnLimit) {
+                DEBUG(llvm::dbgs() << "    Next split protrudes, need: "
+                                   << (ContentStartColumn + ToSplitColumns)
+                                   << ", space: " << ColumnLimit);
+                unsigned ExcessCharactersPenalty =
+                    (ContentStartColumn + ToSplitColumns - ColumnLimit) *
+                    Style.PenaltyExcessCharacter;
+                if (NewBreakPenalty < ExcessCharactersPenalty) {
+                  Reflow = false;
+                }
+              }
+            }
+          }
+        } else {
+          DEBUG(llvm::dbgs() << "not found.\n");
+        }
+      }
+      if (!Reflow) {
+        // If we didn't reflow into the next line, the only space to consider is
+        // the next logical line. Reset our state to match the start of the next
+        // line.
+        TailOffset = 0;
+        ContentStartColumn =
+            Token->getContentStartColumn(NextLineIndex, /*Break=*/false);
+        RemainingTokenColumns = Token->getRemainingLength(
+            NextLineIndex, TailOffset, ContentStartColumn);
+        // Adapt the start of the token, for example indent.
+        if (!DryRun)
+          Token->adaptStartOfLine(NextLineIndex, Whitespaces);
+      } else {
+        // If we found a reflow split and have added a new break before the next
+        // line, we are going to remove the line break at the start of the next
+        // logical line. For example, here we'll add a new line break after
+        // 'text', and subsequently delete the line break between 'that' and
+        // 'reflows'.
+        //   // some text that
+        //   // reflows
+        // ->
+        //   // some text
+        //   // that reflows
+        // When adding the line break, we also added the penalty for it, so we
+        // need to subtract that penalty again when we remove the line break due
+        // to reflowing.
+        if (NewBreakBefore) {
+          assert(Penalty >= NewBreakPenalty);
+          Penalty -= NewBreakPenalty;
+        }
+        if (!DryRun)
+          Token->reflow(NextLineIndex, Whitespaces);
+      }
+    }
   }
 
   BreakableToken::Split SplitAfterLastLine =
-      Token->getSplitAfterLastLine(TailOffset, ColumnLimit);
+      Token->getSplitAfterLastLine(TailOffset);
   if (SplitAfterLastLine.first != StringRef::npos) {
     DEBUG(llvm::dbgs() << "Replacing whitespace after last line.\n");
     if (!DryRun)
       Token->replaceWhitespaceAfterLastLine(TailOffset, SplitAfterLastLine,
                                             Whitespaces);
-    RemainingTokenColumns = Token->getLineLengthAfterSplitAfterLastLine(
-        TailOffset, SplitAfterLastLine);
+    ContentStartColumn =
+        Token->getContentStartColumn(Token->getLineCount() - 1, /*Break=*/true);
+    RemainingTokenColumns = Token->getRemainingLength(
+        Token->getLineCount() - 1,
+        TailOffset + SplitAfterLastLine.first + SplitAfterLastLine.second,
+        ContentStartColumn);
   }
 
-  State.Column = RemainingTokenColumns;
+  State.Column = ContentStartColumn + RemainingTokenColumns -
+                 Current.UnbreakableTailLength;
 
   if (BreakInserted) {
     // If we break the token inside a parameter list, we need to break before