Handle zero-width and double-width characters in string literals and comments.

Summary:
Count display column width instead of the number of code points. This also
includes correct handling of tabs inside string literals and comments (with the
exception of multiline string literals/comments in which tabs appear before
the first escaped newline).

Reviewers: djasper, klimek

Reviewed By: klimek

CC: cfe-commits, klimek

Differential Revision: http://llvm-reviews.chandlerc.com/D1601

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@190052 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp
index 3b29137..053be4b 100644
--- a/lib/Format/BreakableToken.cpp
+++ b/lib/Format/BreakableToken.cpp
@@ -41,6 +41,7 @@
 static BreakableToken::Split getCommentSplit(StringRef Text,
                                              unsigned ContentStartColumn,
                                              unsigned ColumnLimit,
+                                             unsigned TabWidth,
                                              encoding::Encoding Encoding) {
   if (ColumnLimit <= ContentStartColumn + 1)
     return BreakableToken::Split(StringRef::npos, 0);
@@ -49,9 +50,14 @@
   unsigned MaxSplitBytes = 0;
 
   for (unsigned NumChars = 0;
-       NumChars < MaxSplit && MaxSplitBytes < Text.size(); ++NumChars)
-    MaxSplitBytes +=
+       NumChars < MaxSplit && MaxSplitBytes < Text.size();) {
+    unsigned BytesInChar =
         encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
+    NumChars +=
+        encoding::columnWidthWithTabs(Text.substr(MaxSplitBytes, BytesInChar),
+                                      ContentStartColumn, TabWidth, Encoding);
+    MaxSplitBytes += BytesInChar;
+  }
 
   StringRef::size_type SpaceOffset = Text.find_last_of(Blanks, MaxSplitBytes);
   if (SpaceOffset == StringRef::npos ||
@@ -78,6 +84,7 @@
 static BreakableToken::Split getStringSplit(StringRef Text,
                                             unsigned ContentStartColumn,
                                             unsigned ColumnLimit,
+                                            unsigned TabWidth,
                                             encoding::Encoding Encoding) {
   // FIXME: Reduce unit test case.
   if (Text.empty())
@@ -86,7 +93,9 @@
     return BreakableToken::Split(StringRef::npos, 0);
   unsigned MaxSplit =
       std::min<unsigned>(ColumnLimit - ContentStartColumn,
-                         encoding::getCodePointCount(Text, Encoding) - 1);
+                         encoding::columnWidthWithTabs(Text, ContentStartColumn,
+                                                       TabWidth, Encoding) -
+                             1);
   StringRef::size_type SpaceOffset = 0;
   StringRef::size_type SlashOffset = 0;
   StringRef::size_type WordStartOffset = 0;
@@ -98,7 +107,9 @@
       Chars += Advance;
     } else {
       Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
-      Chars += 1;
+      Chars += encoding::columnWidthWithTabs(Text.substr(0, Advance),
+                                             ContentStartColumn + Chars,
+                                             TabWidth, Encoding);
     }
 
     if (Chars > MaxSplit)
@@ -131,14 +142,17 @@
 unsigned BreakableSingleLineToken::getLineLengthAfterSplit(
     unsigned LineIndex, unsigned Offset, StringRef::size_type Length) const {
   return StartColumn + Prefix.size() + Postfix.size() +
-         encoding::getCodePointCount(Line.substr(Offset, Length), Encoding);
+         encoding::columnWidthWithTabs(Line.substr(Offset, Length),
+                                       StartColumn + Prefix.size(),
+                                       Style.TabWidth, Encoding);
 }
 
 BreakableSingleLineToken::BreakableSingleLineToken(
     const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
-    StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding)
-    : BreakableToken(Tok, InPPDirective, Encoding), StartColumn(StartColumn),
-      Prefix(Prefix), Postfix(Postfix) {
+    StringRef Postfix, bool InPPDirective, encoding::Encoding Encoding,
+    const FormatStyle &Style)
+    : BreakableToken(Tok, InPPDirective, Encoding, Style),
+      StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix) {
   assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
   Line = Tok.TokenText.substr(
       Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
@@ -147,15 +161,16 @@
 BreakableStringLiteral::BreakableStringLiteral(const FormatToken &Tok,
                                                unsigned StartColumn,
                                                bool InPPDirective,
-                                               encoding::Encoding Encoding)
+                                               encoding::Encoding Encoding,
+                                               const FormatStyle &Style)
     : BreakableSingleLineToken(Tok, StartColumn, "\"", "\"", InPPDirective,
-                               Encoding) {}
+                               Encoding, Style) {}
 
 BreakableToken::Split
 BreakableStringLiteral::getSplit(unsigned LineIndex, unsigned TailOffset,
                                  unsigned ColumnLimit) const {
   return getStringSplit(Line.substr(TailOffset), StartColumn + 2, ColumnLimit,
-                        Encoding);
+                        Style.TabWidth, Encoding);
 }
 
 void BreakableStringLiteral::insertBreak(unsigned LineIndex,
@@ -177,10 +192,11 @@
 BreakableLineComment::BreakableLineComment(const FormatToken &Token,
                                            unsigned StartColumn,
                                            bool InPPDirective,
-                                           encoding::Encoding Encoding)
+                                           encoding::Encoding Encoding,
+                                           const FormatStyle &Style)
     : BreakableSingleLineToken(Token, StartColumn,
                                getLineCommentPrefix(Token.TokenText), "",
-                               InPPDirective, Encoding) {
+                               InPPDirective, Encoding, Style) {
   OriginalPrefix = Prefix;
   if (Token.TokenText.size() > Prefix.size() &&
       isAlphanumeric(Token.TokenText[Prefix.size()])) {
@@ -195,7 +211,7 @@
 BreakableLineComment::getSplit(unsigned LineIndex, unsigned TailOffset,
                                unsigned ColumnLimit) const {
   return getCommentSplit(Line.substr(TailOffset), StartColumn + Prefix.size(),
-                         ColumnLimit, Encoding);
+                         ColumnLimit, Style.TabWidth, Encoding);
 }
 
 void BreakableLineComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
@@ -216,10 +232,10 @@
 }
 
 BreakableBlockComment::BreakableBlockComment(
-    const FormatStyle &Style, const FormatToken &Token, unsigned StartColumn,
+    const FormatToken &Token, unsigned StartColumn,
     unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
-    encoding::Encoding Encoding)
-    : BreakableToken(Token, InPPDirective, Encoding) {
+    encoding::Encoding Encoding, const FormatStyle &Style)
+    : BreakableToken(Token, InPPDirective, Encoding, Style) {
   StringRef TokenText(Token.TokenText);
   assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
   TokenText.substr(2, TokenText.size() - 4).split(Lines, "\n");
@@ -229,7 +245,7 @@
   StartOfLineColumn.resize(Lines.size());
   StartOfLineColumn[0] = StartColumn + 2;
   for (size_t i = 1; i < Lines.size(); ++i)
-    adjustWhitespace(Style, i, IndentDelta);
+    adjustWhitespace(i, IndentDelta);
 
   Decoration = "* ";
   if (Lines.size() == 1 && !FirstInLine) {
@@ -282,8 +298,7 @@
   });
 }
 
-void BreakableBlockComment::adjustWhitespace(const FormatStyle &Style,
-                                             unsigned LineIndex,
+void BreakableBlockComment::adjustWhitespace(unsigned LineIndex,
                                              int IndentDelta) {
   // When in a preprocessor directive, the trailing backslash in a block comment
   // is not needed, but can serve a purpose of uniformity with necessary escaped
@@ -306,6 +321,7 @@
   if (StartOfLine == StringRef::npos)
     StartOfLine = Lines[LineIndex].size();
 
+  StringRef Whitespace = Lines[LineIndex].substr(0, StartOfLine);
   // Adjust Lines to only contain relevant text.
   Lines[LineIndex - 1] = Lines[LineIndex - 1].substr(0, EndOfPreviousLine);
   Lines[LineIndex] = Lines[LineIndex].substr(StartOfLine);
@@ -321,16 +337,19 @@
   // if leading tabs are intermixed with spaces, that is not a high priority.
 
   // Adjust the start column uniformly accross all lines.
-  StartOfLineColumn[LineIndex] = std::max<int>(0, StartOfLine + IndentDelta);
+  StartOfLineColumn[LineIndex] =
+      std::max<int>(0, Whitespace.size() + IndentDelta);
 }
 
 unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); }
 
 unsigned BreakableBlockComment::getLineLengthAfterSplit(
     unsigned LineIndex, unsigned Offset, StringRef::size_type Length) const {
-  return getContentStartColumn(LineIndex, Offset) +
-         encoding::getCodePointCount(Lines[LineIndex].substr(Offset, Length),
-                                     Encoding) +
+  unsigned ContentStartColumn = getContentStartColumn(LineIndex, Offset);
+  return ContentStartColumn +
+         encoding::columnWidthWithTabs(Lines[LineIndex].substr(Offset, Length),
+                                       ContentStartColumn, Style.TabWidth,
+                                       Encoding) +
          // The last line gets a "*/" postfix.
          (LineIndex + 1 == Lines.size() ? 2 : 0);
 }
@@ -340,7 +359,7 @@
                                 unsigned ColumnLimit) const {
   return getCommentSplit(Lines[LineIndex].substr(TailOffset),
                          getContentStartColumn(LineIndex, TailOffset),
-                         ColumnLimit, Encoding);
+                         ColumnLimit, Style.TabWidth, Encoding);
 }
 
 void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
diff --git a/lib/Format/BreakableToken.h b/lib/Format/BreakableToken.h
index 90b78ac..65b9015 100644
--- a/lib/Format/BreakableToken.h
+++ b/lib/Format/BreakableToken.h
@@ -67,12 +67,14 @@
 
 protected:
   BreakableToken(const FormatToken &Tok, bool InPPDirective,
-                 encoding::Encoding Encoding)
-      : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding) {}
+                 encoding::Encoding Encoding, const FormatStyle &Style)
+      : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
+        Style(Style) {}
 
   const FormatToken &Tok;
   const bool InPPDirective;
   const encoding::Encoding Encoding;
+  const FormatStyle &Style;
 };
 
 /// \brief Base class for single line tokens that can be broken.
@@ -88,7 +90,8 @@
 protected:
   BreakableSingleLineToken(const FormatToken &Tok, unsigned StartColumn,
                            StringRef Prefix, StringRef Postfix,
-                           bool InPPDirective, encoding::Encoding Encoding);
+                           bool InPPDirective, encoding::Encoding Encoding,
+                           const FormatStyle &Style);
 
   // The column in which the token starts.
   unsigned StartColumn;
@@ -107,7 +110,8 @@
   /// \p StartColumn specifies the column in which the token will start
   /// after formatting.
   BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
-                         bool InPPDirective, encoding::Encoding Encoding);
+                         bool InPPDirective, encoding::Encoding Encoding,
+                         const FormatStyle &Style);
 
   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                          unsigned ColumnLimit) const;
@@ -122,7 +126,8 @@
   /// \p StartColumn specifies the column in which the comment will start
   /// after formatting.
   BreakableLineComment(const FormatToken &Token, unsigned StartColumn,
-                       bool InPPDirective, encoding::Encoding Encoding);
+                       bool InPPDirective, encoding::Encoding Encoding,
+                       const FormatStyle &Style);
 
   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                          unsigned ColumnLimit) const;
@@ -144,10 +149,10 @@
   /// after formatting, while \p OriginalStartColumn specifies in which
   /// column the comment started before formatting.
   /// If the comment starts a line after formatting, set \p FirstInLine to true.
-  BreakableBlockComment(const FormatStyle &Style, const FormatToken &Token,
-                        unsigned StartColumn, unsigned OriginaStartColumn,
-                        bool FirstInLine, bool InPPDirective,
-                        encoding::Encoding Encoding);
+  BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
+                        unsigned OriginaStartColumn, bool FirstInLine,
+                        bool InPPDirective, encoding::Encoding Encoding,
+                        const FormatStyle &Style);
 
   virtual unsigned getLineCount() const;
   virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
@@ -172,8 +177,7 @@
   // Sets StartOfLineColumn to the intended column in which the text at
   // Lines[LineIndex] starts (note that the decoration, if present, is not
   // considered part of the text).
-  void adjustWhitespace(const FormatStyle &Style, unsigned LineIndex,
-                        int IndentDelta);
+  void adjustWhitespace(unsigned LineIndex, int IndentDelta);
 
   // Returns the column at which the text in line LineIndex starts, when broken
   // at TailOffset. Note that the decoration (if present) is not considered part
diff --git a/lib/Format/ContinuationIndenter.cpp b/lib/Format/ContinuationIndenter.cpp
index c894a4b..9e84ea7 100644
--- a/lib/Format/ContinuationIndenter.cpp
+++ b/lib/Format/ContinuationIndenter.cpp
@@ -623,10 +623,10 @@
     State.Stack[i].BreakBeforeParameter = true;
 
   unsigned ColumnsUsed =
-      State.Column - Current.CodePointCount + Current.CodePointsInFirstLine;
+      State.Column - Current.CodePointCount + Current.FirstLineColumnWidth;
   // We can only affect layout of the first and the last line, so the penalty
   // for all other lines is constant, and we ignore it.
-  State.Column = Current.CodePointsInLastLine;
+  State.Column = Current.LastLineColumnWidth;
 
   if (ColumnsUsed > getColumnLimit(State))
     return Style.PenaltyExcessCharacter * (ColumnsUsed - getColumnLimit(State));
@@ -659,14 +659,14 @@
       return 0;
 
     Token.reset(new BreakableStringLiteral(
-        Current, StartColumn, State.Line->InPPDirective, Encoding));
+        Current, StartColumn, State.Line->InPPDirective, Encoding, Style));
   } else if (Current.Type == TT_BlockComment && Current.isTrailingComment()) {
     unsigned OriginalStartColumn =
         SourceMgr.getSpellingColumnNumber(Current.getStartOfNonWhitespace()) -
         1;
     Token.reset(new BreakableBlockComment(
-        Style, Current, StartColumn, OriginalStartColumn, !Current.Previous,
-        State.Line->InPPDirective, Encoding));
+        Current, StartColumn, OriginalStartColumn, !Current.Previous,
+        State.Line->InPPDirective, Encoding, Style));
   } else if (Current.Type == TT_LineComment &&
              (Current.Previous == NULL ||
               Current.Previous->Type != TT_ImplicitStringLiteral)) {
@@ -678,12 +678,12 @@
     // leading whitespace in consecutive lines when changing indentation of
     // the first line similar to what we do with block comments.
     if (Current.isMultiline()) {
-      State.Column = StartColumn + Current.CodePointsInFirstLine;
+      State.Column = StartColumn + Current.FirstLineColumnWidth;
       return 0;
     }
 
-    Token.reset(new BreakableLineComment(Current, StartColumn,
-                                         State.Line->InPPDirective, Encoding));
+    Token.reset(new BreakableLineComment(
+        Current, StartColumn, State.Line->InPPDirective, Encoding, Style));
   } else {
     return 0;
   }
diff --git a/lib/Format/Encoding.h b/lib/Format/Encoding.h
index e9e9ae7..356334d 100644
--- a/lib/Format/Encoding.h
+++ b/lib/Format/Encoding.h
@@ -18,6 +18,7 @@
 
 #include "clang/Basic/LLVM.h"
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Unicode.h"
 
 namespace clang {
 namespace format {
@@ -57,6 +58,37 @@
   }
 }
 
+/// \brief Returns the number of columns required to display the \p Text on a
+/// generic Unicode-capable terminal. Text is assumed to use the specified
+/// \p Encoding. Falls back to the byte count when no width can be computed.
+inline unsigned columnWidth(StringRef Text, Encoding Encoding) {
+  if (Encoding == Encoding_UTF8) {
+    int ContentWidth = llvm::sys::unicode::columnWidthUTF8(Text);
+    if (ContentWidth >= 0) // Negative: no usable width was computed.
+      return ContentWidth;
+  }
+  return Text.size(); // Non-UTF-8 or fallback: one column per byte.
+}
+
+/// \brief Returns the number of columns required to display the \p Text,
+/// starting from the \p StartColumn on a terminal with the \p TabWidth. The
+/// text is assumed to use the specified \p Encoding. A \p TabWidth of zero
+/// makes tabs occupy no columns (and avoids a modulo by zero).
+inline unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn,
+                                    unsigned TabWidth, Encoding Encoding) {
+  unsigned TotalWidth = 0;
+  StringRef Tail = Text;
+  for (;;) {
+    StringRef::size_type TabPos = Tail.find('\t');
+    if (TabPos == StringRef::npos)
+      return TotalWidth + columnWidth(Tail, Encoding);
+    TotalWidth += columnWidth(Tail.substr(0, TabPos), Encoding);
+    if (TabWidth) // Advance to the next tab stop.
+      TotalWidth += TabWidth - (TotalWidth + StartColumn) % TabWidth;
+    Tail = Tail.substr(TabPos + 1);
+  }
+}
+
 /// \brief Gets the number of bytes in a sequence representing a single
 /// codepoint and starting with FirstChar in the specified Encoding.
 inline unsigned getCodePointNumBytes(char FirstChar, Encoding Encoding) {
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index c482c40..02adc5a 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -136,6 +136,7 @@
     IO.mapOptional("Cpp11BracedListStyle", Style.Cpp11BracedListStyle);
     IO.mapOptional("Standard", Style.Standard);
     IO.mapOptional("IndentWidth", Style.IndentWidth);
+    IO.mapOptional("TabWidth", Style.TabWidth);
     IO.mapOptional("UseTab", Style.UseTab);
     IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces);
     IO.mapOptional("IndentFunctionDeclarationAfterType",
@@ -184,6 +185,7 @@
   LLVMStyle.IndentCaseLabels = false;
   LLVMStyle.IndentFunctionDeclarationAfterType = false;
   LLVMStyle.IndentWidth = 2;
+  LLVMStyle.TabWidth = 8;
   LLVMStyle.MaxEmptyLinesToKeep = 1;
   LLVMStyle.NamespaceIndentation = FormatStyle::NI_None;
   LLVMStyle.ObjCSpaceBeforeProtocolList = true;
@@ -225,6 +227,7 @@
   GoogleStyle.IndentCaseLabels = true;
   GoogleStyle.IndentFunctionDeclarationAfterType = true;
   GoogleStyle.IndentWidth = 2;
+  GoogleStyle.TabWidth = 8;
   GoogleStyle.MaxEmptyLinesToKeep = 1;
   GoogleStyle.NamespaceIndentation = FormatStyle::NI_None;
   GoogleStyle.ObjCSpaceBeforeProtocolList = false;
@@ -629,7 +632,7 @@
           ++Column;
           break;
         case '\t':
-          Column += Style.IndentWidth - Column % Style.IndentWidth;
+          Column += Style.TabWidth - Column % Style.TabWidth;
           break;
         default:
           ++Column;
@@ -681,10 +684,12 @@
       StringRef Text = FormatTok->TokenText;
       size_t FirstNewlinePos = Text.find('\n');
       if (FirstNewlinePos != StringRef::npos) {
-        FormatTok->CodePointsInFirstLine = encoding::getCodePointCount(
-            Text.substr(0, FirstNewlinePos), Encoding);
-        FormatTok->CodePointsInLastLine = encoding::getCodePointCount(
-            Text.substr(Text.find_last_of('\n') + 1), Encoding);
+        // FIXME: Handle embedded tabs.
+        FormatTok->FirstLineColumnWidth = encoding::columnWidthWithTabs(
+            Text.substr(0, FirstNewlinePos), 0, Style.TabWidth, Encoding);
+        FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
+            Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth,
+            Encoding);
       }
     }
     // FIXME: Add the CodePointCount to Column.
diff --git a/lib/Format/FormatToken.h b/lib/Format/FormatToken.h
index 0b770f3..e4342dd 100644
--- a/lib/Format/FormatToken.h
+++ b/lib/Format/FormatToken.h
@@ -83,7 +83,7 @@
 struct FormatToken {
   FormatToken()
       : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
-        CodePointCount(0), CodePointsInFirstLine(0), CodePointsInLastLine(0),
+        CodePointCount(0), FirstLineColumnWidth(0), LastLineColumnWidth(0),
         IsFirst(false), MustBreakBefore(false), IsUnterminatedLiteral(false),
         BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0),
         CanBreakBefore(false), ClosesTemplateDeclaration(false),
@@ -120,15 +120,15 @@
   /// \brief Contains the number of code points in the first line of a
   /// multi-line string literal or comment. Zero if there's no newline in the
   /// token.
-  unsigned CodePointsInFirstLine;
+  unsigned FirstLineColumnWidth;
 
   /// \brief Contains the number of code points in the last line of a
   /// multi-line string literal or comment. Can be zero for line comments.
-  unsigned CodePointsInLastLine;
+  unsigned LastLineColumnWidth;
 
   /// \brief Returns \c true if the token text contains newlines (escaped or
   /// not).
-  bool isMultiline() const { return CodePointsInFirstLine != 0; }
+  bool isMultiline() const { return FirstLineColumnWidth != 0; }
 
   /// \brief Indicates that this is the first token.
   bool IsFirst;
diff --git a/lib/Format/WhitespaceManager.cpp b/lib/Format/WhitespaceManager.cpp
index 41519b6..0f46e62 100644
--- a/lib/Format/WhitespaceManager.cpp
+++ b/lib/Format/WhitespaceManager.cpp
@@ -272,8 +272,8 @@
   if (!Style.UseTab)
     return std::string(Spaces, ' ');
 
-  return std::string(Spaces / Style.IndentWidth, '\t') +
-         std::string(Spaces % Style.IndentWidth, ' ');
+  return std::string(Spaces / Style.TabWidth, '\t') +
+         std::string(Spaces % Style.TabWidth, ' ');
 }
 
 } // namespace format