Fixes various problems with accounting for tabs in the original code.

We now count the original token's column directly when lexing the
tokens, where we already have all knowledge about where lines start.

Before this patch, formatting:
 void f() {
 \tg();
 \th();
 }
would incorrectly count the \t's as 1 character if only the line
containing h() was reformatted, and thus indent h() at offset 1.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@189585 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index 3982ba6..3c740d9 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -487,11 +487,11 @@
 
 class FormatTokenLexer {
 public:
-  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr,
+  FormatTokenLexer(Lexer &Lex, SourceManager &SourceMgr, FormatStyle &Style,
                    encoding::Encoding Encoding)
-      : FormatTok(NULL), GreaterStashed(false), TrailingWhitespace(0), Lex(Lex),
-        SourceMgr(SourceMgr), IdentTable(getFormattingLangOpts()),
-        Encoding(Encoding) {
+      : FormatTok(NULL), GreaterStashed(false), Column(0),
+        TrailingWhitespace(0), Lex(Lex), SourceMgr(SourceMgr), Style(Style),
+        IdentTable(getFormattingLangOpts()), Encoding(Encoding) {
     Lex.SetKeepWhitespaceMode(true);
   }
 
@@ -509,6 +509,7 @@
   FormatToken *getNextToken() {
     if (GreaterStashed) {
       // Create a synthesized second '>' token.
+      // FIXME: Increment Column and set OriginalColumn.
       Token Greater = FormatTok->Tok;
       FormatTok = new (Allocator.Allocate()) FormatToken;
       FormatTok->Tok = Greater;
@@ -532,13 +533,29 @@
     // Consume and record whitespace until we find a significant token.
     unsigned WhitespaceLength = TrailingWhitespace;
     while (FormatTok->Tok.is(tok::unknown)) {
-      unsigned Newlines = FormatTok->TokenText.count('\n');
-      if (Newlines > 0)
-        FormatTok->LastNewlineOffset =
-            WhitespaceLength + FormatTok->TokenText.rfind('\n') + 1;
-      FormatTok->NewlinesBefore += Newlines;
-      unsigned EscapedNewlines = FormatTok->TokenText.count("\\\n");
-      FormatTok->HasUnescapedNewline |= EscapedNewlines != Newlines;
+      for (int i = 0, e = FormatTok->TokenText.size(); i != e; ++i) {
+        switch (FormatTok->TokenText[i]) {
+        case '\n':
+          ++FormatTok->NewlinesBefore;
+          // FIXME: This is technically incorrect, as it could also
+          // be a literal backslash at the end of the line.
+          if (i == 0 || FormatTok->TokenText[i-1] != '\\')
+            FormatTok->HasUnescapedNewline = true;
+          FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
+          Column = 0;
+          break;
+        case ' ':
+          ++Column;
+          break;
+        case '\t':
+          Column += Style.IndentWidth - Column % Style.IndentWidth;
+          break;
+        default:
+          ++Column;
+          break;
+        }
+      }
+
       WhitespaceLength += FormatTok->Tok.getLength();
 
       readRawToken(*FormatTok);
@@ -554,11 +571,14 @@
            FormatTok->TokenText[1] == '\n') {
       // FIXME: ++FormatTok->NewlinesBefore is missing...
       WhitespaceLength += 2;
+      Column = 0;
       FormatTok->TokenText = FormatTok->TokenText.substr(2);
     }
+    FormatTok->OriginalColumn = Column;
 
     TrailingWhitespace = 0;
     if (FormatTok->Tok.is(tok::comment)) {
+      // FIXME: Add the trimmed whitespace to Column.
       StringRef UntrimmedText = FormatTok->TokenText;
       FormatTok->TokenText = FormatTok->TokenText.rtrim();
       TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
@@ -576,6 +596,8 @@
     FormatTok->CodePointCount =
         encoding::getCodePointCount(FormatTok->TokenText, Encoding);
 
+    // FIXME: Add the CodePointCount to Column.
+
     FormatTok->WhitespaceRange = SourceRange(
         WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
     return FormatTok;
@@ -583,9 +605,11 @@
 
   FormatToken *FormatTok;
   bool GreaterStashed;
+  unsigned Column;
   unsigned TrailingWhitespace;
   Lexer &Lex;
   SourceManager &SourceMgr;
+  FormatStyle &Style;
   IdentifierTable IdentTable;
   encoding::Encoding Encoding;
   llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
@@ -595,7 +619,6 @@
     Lex.LexFromRawLexer(Tok.Tok);
     Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                               Tok.Tok.getLength());
-
     // For formatting, treat unterminated string literals like normal string
     // literals.
     if (Tok.is(tok::unknown) && !Tok.TokenText.empty() &&
@@ -622,7 +645,7 @@
   virtual ~Formatter() {}
 
   tooling::Replacements format() {
-    FormatTokenLexer Tokens(Lex, SourceMgr, Encoding);
+    FormatTokenLexer Tokens(Lex, SourceMgr, Style, Encoding);
 
     UnwrappedLineParser Parser(Style, Tokens.lex(), *this);
     bool StructuralError = Parser.parse();
@@ -692,9 +715,7 @@
           formatFirstToken(*TheLine.First, PreviousLineLastToken, Indent,
                            TheLine.InPPDirective);
         } else {
-          Indent = LevelIndent =
-              SourceMgr.getSpellingColumnNumber(FirstTok->Tok.getLocation()) -
-              1;
+          Indent = LevelIndent = FirstTok->OriginalColumn;
         }
         ContinuationIndenter Indenter(Style, SourceMgr, TheLine, Indent,
                                       Whitespaces, Encoding,
@@ -727,8 +748,7 @@
              Tok = Tok->Next) {
           if (Tok == TheLine.First &&
               (Tok->NewlinesBefore > 0 || Tok->IsFirst)) {
-            unsigned LevelIndent =
-                SourceMgr.getSpellingColumnNumber(Tok->Tok.getLocation()) - 1;
+            unsigned LevelIndent = Tok->OriginalColumn;
             // Remove trailing whitespace of the previous line if it was
             // touched.
             if (PreviousLineWasTouched || touchesEmptyLineBefore(TheLine)) {
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h
index 62b0ba6..9509383 100644
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -169,9 +169,18 @@
   /// \brief If this is an opening parenthesis, how are the parameters packed?
   ParameterPackingKind PackingKind;
 
-  /// \brief The total length of the line up to and including this token.
+  /// \brief The total length of the unwrapped line up to and including this
+  /// token.
   unsigned TotalLength;
 
+  /// \brief The original column of this token, including expanded tabs.
+  /// The configured IndentWidth is used as tab width. Only tabs in whitespace
+  /// are expanded.
+  /// FIXME: This is currently only used on the first token of an unwrapped
+  /// line, and the implementation is not correct for other tokens (see the
+  /// FIXMEs in FormatTokenLexer::getNextToken()).
+  unsigned OriginalColumn;
+
   /// \brief The length of following tokens until the next natural split point,
   /// or the next token that can be broken.
   unsigned UnbreakableTailLength;
diff --git a/unittests/Format/FormatTest.cpp b/unittests/Format/FormatTest.cpp
index 122b367..b20007d 100644
--- a/unittests/Format/FormatTest.cpp
+++ b/unittests/Format/FormatTest.cpp
@@ -5528,7 +5528,33 @@
                "\t\t    parameter2); \\\n"
                "\t}",
                Tab);
-
+  EXPECT_EQ("void f() {\n"
+            "\tf();\n"
+            "\tg();\n"
+            "}",
+            format("void f() {\n"
+                   "\tf();\n"
+                   "\tg();\n"
+                   "}",
+                   0, 0, Tab));
+  EXPECT_EQ("void f() {\n"
+            "\tf();\n"
+            "\tg();\n"
+            "}",
+            format("void f() {\n"
+                   "\tf();\n"
+                   "\tg();\n"
+                   "}",
+                   16, 0, Tab));
+  EXPECT_EQ("void f() {\n"
+            "  \tf();\n"
+            "\tg();\n"
+            "}",
+            format("void f() {\n"
+                   "  \tf();\n"
+                   "  \tg();\n"
+                   "}",
+                   21, 0, Tab));
 
   // FIXME: To correctly count mixed whitespace we need to
   // also correctly count mixed whitespace in front of the comment.