When a bad UTF-8 encoding or bogus escape sequence is encountered in a string literal, produce a diagnostic pointing at the erroneous character range, not at the start of the literal.

llvm-svn: 163459

commit: 639b8d05dd2432f0609774f7963b5780dd5366e1 [log] [tgz]
author: Richard Smith <richard-llvm@metafoo.co.uk> Sat Sep 08 07:16:20 2012 +0000
committer: Richard Smith <richard-llvm@metafoo.co.uk> Sat Sep 08 07:16:20 2012 +0000
tree: de4121f4117db133f5da18d62912166d624fa12e
parent: 3e41a5bb3176c2163f1646f313e91c9674658e77 [diff]
diff --git a/clang/lib/Basic/ConvertUTF.c b/clang/lib/Basic/ConvertUTF.c
index 2e25e79..ec57be7 100644
--- a/clang/lib/Basic/ConvertUTF.c
+++ b/clang/lib/Basic/ConvertUTF.c

@@ -393,15 +393,25 @@
 /* --------------------------------------------------------------------- */
 
 /*
+ * Exported function to return the total number of bytes in a codepoint
+ * represented in UTF-8, given the value of the first byte.
+ */
+unsigned getNumBytesForUTF8(UTF8 first) {
+  return trailingBytesForUTF8[first] + 1;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
  * Exported function to return whether a UTF-8 string is legal or not.
  * This is not used here; it's just exported.
  */
-Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
-    while (source != sourceEnd) {
-        int length = trailingBytesForUTF8[*source] + 1;
-        if (length > sourceEnd - source || !isLegalUTF8(source, length))
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
+    while (*source != sourceEnd) {
+        int length = trailingBytesForUTF8[**source] + 1;
+        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
             return false;
-        source += length;
+        *source += length;
     }
     return true;
 }

diff --git a/clang/lib/Basic/ConvertUTFWrapper.cpp b/clang/lib/Basic/ConvertUTFWrapper.cpp
index a1b3f7f..6be3828 100644
--- a/clang/lib/Basic/ConvertUTFWrapper.cpp
+++ b/clang/lib/Basic/ConvertUTFWrapper.cpp

@@ -13,16 +13,19 @@
 namespace clang {
 
 bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
-                       char *&ResultPtr) {
+                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
   assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
   ConversionResult result = conversionOK;
   // Copy the character span over.
   if (WideCharWidth == 1) {
-    if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Source.begin()),
-                           reinterpret_cast<const UTF8*>(Source.end())))
+    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
+    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
       result = sourceIllegal;
-    memcpy(ResultPtr, Source.data(), Source.size());
-    ResultPtr += Source.size();
+      ErrorPtr = Pos;
+    } else {
+      memcpy(ResultPtr, Source.data(), Source.size());
+      ResultPtr += Source.size();
+    }
   } else if (WideCharWidth == 2) {
     const UTF8 *sourceStart = (const UTF8*)Source.data();
     // FIXME: Make the type of the result buffer correct instead of
@@ -34,6 +37,8 @@
         &targetStart, targetStart + 2*Source.size(), flags);
     if (result == conversionOK)
       ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
   } else if (WideCharWidth == 4) {
     const UTF8 *sourceStart = (const UTF8*)Source.data();
     // FIXME: Make the type of the result buffer correct instead of
@@ -45,6 +50,8 @@
         &targetStart, targetStart + 4*Source.size(), flags);
     if (result == conversionOK)
       ResultPtr = reinterpret_cast<char*>(targetStart);
+    else
+      ErrorPtr = sourceStart;
   }
   assert((result != targetExhausted)
          && "ConvertUTF8toUTFXX exhausted target buffer");
@@ -67,4 +74,3 @@
 }
 
 } // end namespace clang
-
commit	639b8d05dd2432f0609774f7963b5780dd5366e1	[log] [tgz]
author	Richard Smith <richard-llvm@metafoo.co.uk>	Sat Sep 08 07:16:20 2012 +0000
committer	Richard Smith <richard-llvm@metafoo.co.uk>	Sat Sep 08 07:16:20 2012 +0000
tree	de4121f4117db133f5da18d62912166d624fa12e
parent	3e41a5bb3176c2163f1646f313e91c9674658e77 [diff]