Implement warning for non-wide string literals with an unexpected encoding. Downgrade error for non-wide character literals with an unexpected encoding to a warning for compatibility with gcc and older versions of clang. <rdar://problem/10837678>. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@150295 91177308-0d34-0410-b5e6-96231b3b80d8

commit: 91359302b822d829afa93c0dadf5f7ce6e19fbc6 [log] [tgz]
author: Eli Friedman <eli.friedman@gmail.com> Sat Feb 11 05:08:10 2012 +0000
committer: Eli Friedman <eli.friedman@gmail.com> Sat Feb 11 05:08:10 2012 +0000
tree: 601729580f8b8e7d697dffde0d0657026cf95fd4
parent: d747efaad80117799ff7e2ab10608f18ee1348c1 [diff] [blame]
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index a3f97d9..547bd4e 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp

@@ -822,17 +822,32 @@
         ++begin;
       } while (begin != end && *begin != '\\');
 
-      uint32_t *tmp_begin = buffer_begin;
+      char const *tmp_in_start = start;
+      uint32_t *tmp_out_start = buffer_begin;
       ConversionResult res =
       ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
                          reinterpret_cast<UTF8 const *>(begin),
                          &buffer_begin,buffer_end,strictConversion);
       if (res!=conversionOK) {
-        PP.Diag(Loc, diag::err_bad_character_encoding);
-        HadError = true;
+        // If we see bad encoding for unprefixed character literals, warn and 
+        // simply copy the byte values, for compatibility with gcc and 
+        // older versions of clang.
+        bool NoErrorOnBadEncoding = isAscii();
+        unsigned Msg = diag::err_bad_character_encoding;
+        if (NoErrorOnBadEncoding)
+          Msg = diag::warn_bad_character_encoding;
+        PP.Diag(Loc, Msg);
+        if (NoErrorOnBadEncoding) {
+          start = tmp_in_start;
+          buffer_begin = tmp_out_start;
+          for ( ; start != begin; ++start, ++buffer_begin)
+            *buffer_begin = static_cast<uint8_t>(*start);
+        } else {
+          HadError = true;
+        }
       } else {
-        for (; tmp_begin<buffer_begin; ++tmp_begin) {
-          if (*tmp_begin > largest_character_for_kind) {
+        for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
+          if (*tmp_out_start > largest_character_for_kind) {
             HadError = true;
             PP.Diag(Loc, diag::err_character_too_large);
           }
@@ -1097,10 +1112,8 @@
       // Copy the string over
       if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf)))
       {
-        if (Diags)
-          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
-                        diag::err_bad_string_encoding);
-        hadError = true;
+        if (DiagnoseBadString(StringToks[i]))
+          hadError = true;
       }
 
     } else {
@@ -1131,10 +1144,8 @@
           // Copy the character span over.
           if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart)))
           {
-            if (Diags)
-              Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
-                            diag::err_bad_string_encoding);
-            hadError = true;
+            if (DiagnoseBadString(StringToks[i]))
+              hadError = true;
           }
           continue;
         }
@@ -1219,6 +1230,9 @@
   ConversionResult result = conversionOK;
   // Copy the character span over.
   if (CharByteWidth == 1) {
+    if (!isLegalUTF8Sequence(reinterpret_cast<const UTF8*>(Fragment.begin()),
+                             reinterpret_cast<const UTF8*>(Fragment.end())))
+      result = sourceIllegal;
     memcpy(ResultPtr, Fragment.data(), Fragment.size());
     ResultPtr += Fragment.size();
   } else if (CharByteWidth == 2) {
@@ -1226,7 +1240,7 @@
     // FIXME: Make the type of the result buffer correct instead of
     // using reinterpret_cast.
     UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
-    ConversionFlags flags = lenientConversion;
+    ConversionFlags flags = strictConversion;
     result = ConvertUTF8toUTF16(
 	    &sourceStart,sourceStart + Fragment.size(),
         &targetStart,targetStart + 2*Fragment.size(),flags);
@@ -1237,7 +1251,7 @@
     // FIXME: Make the type of the result buffer correct instead of
     // using reinterpret_cast.
     UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
-    ConversionFlags flags = lenientConversion;
+    ConversionFlags flags = strictConversion;
     result = ConvertUTF8toUTF32(
         &sourceStart,sourceStart + Fragment.size(),
         &targetStart,targetStart + 4*Fragment.size(),flags);
@@ -1249,6 +1263,17 @@
   return result != conversionOK;
 }
 
+bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
+  // Pick the diagnostic for a badly encoded string token and report it at
+  // Tok's location when Diags is set. Unprefixed (isAscii()) literals only
+  // warn, for compatibility with gcc and older versions of clang.
+  bool NoErrorOnBadEncoding = isAscii();
+  unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
+                                        diag::err_bad_string_encoding;
+  if (Diags)
+    Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
+  return !NoErrorOnBadEncoding; // true only when the error form was chosen
+}
 
 /// getOffsetOfStringByte - This function returns the offset of the
 /// specified byte of the string data represented by Token.  This handles
commit	91359302b822d829afa93c0dadf5f7ce6e19fbc6	[log] [tgz]
author	Eli Friedman <eli.friedman@gmail.com>	Sat Feb 11 05:08:10 2012 +0000
committer	Eli Friedman <eli.friedman@gmail.com>	Sat Feb 11 05:08:10 2012 +0000
tree	601729580f8b8e7d697dffde0d0657026cf95fd4
parent	d747efaad80117799ff7e2ab10608f18ee1348c1 [diff] [blame]