Fixed calculation of penalty when breaking tokens.

Summary:
Introduced two new style parameters: PenaltyBreakComment and
PenaltyBreakString. Add penalty for each character of a breakable token beyond
the column limit (this relates mainly to comments, as they are broken only on
whitespace). Tuned PenaltyBreakComment to prefer comment breaking over breaking
inside most binary expressions.
Fixed a bug that prevented *, & and && from being considered TT_BinaryOperator
in the presense of adjacent comments.

Reviewers: klimek, djasper

Reviewed By: klimek

CC: cfe-commits

Differential Revision: http://llvm-reviews.chandlerc.com/D933

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@183530 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Format/BreakableToken.cpp b/lib/Format/BreakableToken.cpp
index 5e5604c..94b4322 100644
--- a/lib/Format/BreakableToken.cpp
+++ b/lib/Format/BreakableToken.cpp
@@ -112,11 +112,10 @@
 
 unsigned BreakableSingleLineToken::getLineCount() const { return 1; }
 
-unsigned
-BreakableSingleLineToken::getLineLengthAfterSplit(unsigned LineIndex,
-                                                  unsigned TailOffset) const {
+unsigned BreakableSingleLineToken::getLineLengthAfterSplit(
+    unsigned LineIndex, unsigned Offset, StringRef::size_type Length) const {
   return StartColumn + Prefix.size() + Postfix.size() +
-         encoding::getCodePointCount(Line.substr(TailOffset), Encoding);
+         encoding::getCodePointCount(Line.substr(Offset, Length), Encoding);
 }
 
 void BreakableSingleLineToken::insertBreak(unsigned LineIndex,
@@ -268,11 +267,10 @@
 
 unsigned BreakableBlockComment::getLineCount() const { return Lines.size(); }
 
-unsigned
-BreakableBlockComment::getLineLengthAfterSplit(unsigned LineIndex,
-                                               unsigned TailOffset) const {
-  return getContentStartColumn(LineIndex, TailOffset) +
-         encoding::getCodePointCount(Lines[LineIndex].substr(TailOffset),
+unsigned BreakableBlockComment::getLineLengthAfterSplit(
+    unsigned LineIndex, unsigned Offset, StringRef::size_type Length) const {
+  return getContentStartColumn(LineIndex, Offset) +
+         encoding::getCodePointCount(Lines[LineIndex].substr(Offset, Length),
                                      Encoding) +
          // The last line gets a "*/" postfix.
          (LineIndex + 1 == Lines.size() ? 2 : 0);
diff --git a/lib/Format/BreakableToken.h b/lib/Format/BreakableToken.h
index 157bff4..100bb19 100644
--- a/lib/Format/BreakableToken.h
+++ b/lib/Format/BreakableToken.h
@@ -33,7 +33,7 @@
 /// strategy into the class, instead of controlling it from the outside.
 class BreakableToken {
 public:
-  // Contains starting character index and length of split.
+  /// \brief Contains starting character index and length of split.
   typedef std::pair<StringRef::size_type, unsigned> Split;
 
   virtual ~BreakableToken() {}
@@ -41,13 +41,15 @@
   /// \brief Returns the number of lines in this token in the original code.
   virtual unsigned getLineCount() const = 0;
 
-  /// \brief Returns the rest of the length of the line at \p LineIndex,
-  /// when broken at \p TailOffset.
+  /// \brief Returns the number of columns required to format the piece of line
+  /// at \p LineIndex, from byte offset \p Offset with length \p Length.
   ///
-  /// Note that previous breaks are not taken into account. \p TailOffset
-  /// is always specified from the start of the (original) line.
-  virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
-                                           unsigned TailOffset) const = 0;
+  /// Note that previous breaks are not taken into account. \p Offset is always
+  /// specified from the start of the (original) line.
+  /// \p Length can be set to StringRef::npos, which means "to the end of line".
+  virtual unsigned
+      getLineLengthAfterSplit(unsigned LineIndex, unsigned Offset,
+                              StringRef::size_type Length) const = 0;
 
   /// \brief Returns a range (offset, length) at which to break the line at
   /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
@@ -80,7 +82,8 @@
 public:
   virtual unsigned getLineCount() const;
   virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
-                                           unsigned TailOffset) const;
+                                           unsigned TailOffset,
+                                           StringRef::size_type Length) const;
   virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
                            bool InPPDirective, WhitespaceManager &Whitespaces);
 
@@ -139,7 +142,8 @@
 
   virtual unsigned getLineCount() const;
   virtual unsigned getLineLengthAfterSplit(unsigned LineIndex,
-                                           unsigned TailOffset) const;
+                                           unsigned TailOffset,
+                                           StringRef::size_type Length) const;
   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
                          unsigned ColumnLimit) const;
   virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp
index 9dd5e4a..f61f1fb 100644
--- a/lib/Format/Format.cpp
+++ b/lib/Format/Format.cpp
@@ -96,6 +96,8 @@
     IO.mapOptional("MaxEmptyLinesToKeep", Style.MaxEmptyLinesToKeep);
     IO.mapOptional("ObjCSpaceBeforeProtocolList",
                    Style.ObjCSpaceBeforeProtocolList);
+    IO.mapOptional("PenaltyBreakComment", Style.PenaltyBreakComment);
+    IO.mapOptional("PenaltyBreakString", Style.PenaltyBreakString);
     IO.mapOptional("PenaltyExcessCharacter", Style.PenaltyExcessCharacter);
     IO.mapOptional("PenaltyReturnTypeOnItsOwnLine",
                    Style.PenaltyReturnTypeOnItsOwnLine);
@@ -130,6 +132,8 @@
   LLVMStyle.IndentCaseLabels = false;
   LLVMStyle.MaxEmptyLinesToKeep = 1;
   LLVMStyle.ObjCSpaceBeforeProtocolList = true;
+  LLVMStyle.PenaltyBreakComment = 45;
+  LLVMStyle.PenaltyBreakString = 1000;
   LLVMStyle.PenaltyExcessCharacter = 1000000;
   LLVMStyle.PenaltyReturnTypeOnItsOwnLine = 75;
   LLVMStyle.PointerBindsToType = false;
@@ -157,6 +161,8 @@
   GoogleStyle.IndentCaseLabels = true;
   GoogleStyle.MaxEmptyLinesToKeep = 1;
   GoogleStyle.ObjCSpaceBeforeProtocolList = false;
+  GoogleStyle.PenaltyBreakComment = 45;
+  GoogleStyle.PenaltyBreakString = 1000;
   GoogleStyle.PenaltyExcessCharacter = 1000000;
   GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 200;
   GoogleStyle.PointerBindsToType = true;
@@ -840,8 +846,8 @@
                                        Whitespaces);
       }
       unsigned TailOffset = 0;
-      unsigned RemainingTokenColumns =
-          Token->getLineLengthAfterSplit(LineIndex, TailOffset);
+      unsigned RemainingTokenColumns = Token->getLineLengthAfterSplit(
+          LineIndex, TailOffset, StringRef::npos);
       while (RemainingTokenColumns > RemainingSpace) {
         BreakableToken::Split Split =
             Token->getSplit(LineIndex, TailOffset, getColumnLimit());
@@ -849,15 +855,23 @@
           break;
         assert(Split.first != 0);
         unsigned NewRemainingTokenColumns = Token->getLineLengthAfterSplit(
-            LineIndex, TailOffset + Split.first + Split.second);
+            LineIndex, TailOffset + Split.first + Split.second,
+            StringRef::npos);
         assert(NewRemainingTokenColumns < RemainingTokenColumns);
         if (!DryRun) {
           Token->insertBreak(LineIndex, TailOffset, Split, Line.InPPDirective,
                              Whitespaces);
         }
+        Penalty += Current.is(tok::string_literal) ? Style.PenaltyBreakString
+                                                   : Style.PenaltyBreakComment;
+        unsigned ColumnsUsed =
+            Token->getLineLengthAfterSplit(LineIndex, TailOffset, Split.first);
+        if (ColumnsUsed > getColumnLimit()) {
+          Penalty +=
+              Style.PenaltyExcessCharacter * (ColumnsUsed - getColumnLimit());
+        }
         TailOffset += Split.first + Split.second;
         RemainingTokenColumns = NewRemainingTokenColumns;
-        Penalty += Style.PenaltyExcessCharacter;
         BreakInserted = true;
       }
       PositionAfterLastLineInToken = RemainingTokenColumns;
diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp
index 8b1382e..20709bb 100644
--- a/lib/Format/TokenAnnotator.cpp
+++ b/lib/Format/TokenAnnotator.cpp
@@ -103,13 +103,16 @@
       // '*' has to be a binary operator but determineStarAmpUsage() will
       // categorize it as an unary operator, so set the right type here.
       if (LookForDecls && CurrentToken->Next) {
-        FormatToken *Prev = CurrentToken->Previous;
-        FormatToken *Next = CurrentToken->Next;
-        if (Prev->Previous->is(tok::identifier) &&
-            Prev->isOneOf(tok::star, tok::amp, tok::ampamp) &&
-            CurrentToken->is(tok::identifier) && Next->isNot(tok::equal)) {
-          Prev->Type = TT_BinaryOperator;
-          LookForDecls = false;
+        FormatToken *Prev = CurrentToken->getPreviousNoneComment();
+        if (Prev) {
+          FormatToken *PrevPrev = Prev->getPreviousNoneComment();
+          FormatToken *Next = CurrentToken->Next;
+          if (PrevPrev && PrevPrev->is(tok::identifier) &&
+              Prev->isOneOf(tok::star, tok::amp, tok::ampamp) &&
+              CurrentToken->is(tok::identifier) && Next->isNot(tok::equal)) {
+            Prev->Type = TT_BinaryOperator;
+            LookForDecls = false;
+          }
         }
       }