When a bad UTF-8 encoding or bogus escape sequence is encountered in a
string literal, produce a diagnostic pointing at the erroneous character
range, not at the start of the literal.
llvm-svn: 163459
diff --git a/clang/lib/Basic/ConvertUTF.c b/clang/lib/Basic/ConvertUTF.c
index 2e25e79..ec57be7 100644
--- a/clang/lib/Basic/ConvertUTF.c
+++ b/clang/lib/Basic/ConvertUTF.c
@@ -393,15 +393,25 @@
/* --------------------------------------------------------------------- */
/*
+ * Exported function to return the total number of bytes in a codepoint
+ * represented in UTF-8, given the value of the first byte.
+ */
+unsigned getNumBytesForUTF8(UTF8 first) {
+ return trailingBytesForUTF8[first] + 1; /* table lookup gives the trailing-byte count for this lead byte; +1 adds the lead byte itself */
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
* Exported function to return whether a UTF-8 string is legal or not.
* This is not used here; it's just exported.
*/
-Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
- while (source != sourceEnd) {
- int length = trailingBytesForUTF8[*source] + 1;
- if (length > sourceEnd - source || !isLegalUTF8(source, length))
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { /* source is an in/out cursor: the loop advances *source as it validates */
+ while (*source != sourceEnd) {
+ int length = trailingBytesForUTF8[**source] + 1; /* byte count of the sequence starting at *source, derived from its first byte */
+ if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) /* sequence runs past sourceEnd or fails isLegalUTF8: return with *source still at the start of that sequence */
return false;
- source += length;
+ *source += length; /* move the caller's cursor past the sequence just validated */
}
return true;
}
diff --git a/clang/lib/Basic/ConvertUTFWrapper.cpp b/clang/lib/Basic/ConvertUTFWrapper.cpp
index a1b3f7f..6be3828 100644
--- a/clang/lib/Basic/ConvertUTFWrapper.cpp
+++ b/clang/lib/Basic/ConvertUTFWrapper.cpp
@@ -13,16 +13,19 @@
namespace clang {
bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
- char *&ResultPtr) {
+ char *&ResultPtr, const UTF8 *&ErrorPtr) {
assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
ConversionResult result = conversionOK;
// Copy the character span over.
if (WideCharWidth == 1) {
- if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Source.begin()),
- reinterpret_cast<const UTF8*>(Source.end())))
+ const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
+ if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
result = sourceIllegal;
- memcpy(ResultPtr, Source.data(), Source.size());
- ResultPtr += Source.size();
+ ErrorPtr = Pos;
+ } else {
+ memcpy(ResultPtr, Source.data(), Source.size());
+ ResultPtr += Source.size();
+ }
} else if (WideCharWidth == 2) {
const UTF8 *sourceStart = (const UTF8*)Source.data();
// FIXME: Make the type of the result buffer correct instead of
@@ -34,6 +37,8 @@
&targetStart, targetStart + 2*Source.size(), flags);
if (result == conversionOK)
ResultPtr = reinterpret_cast<char*>(targetStart);
+ else
+ ErrorPtr = sourceStart;
} else if (WideCharWidth == 4) {
const UTF8 *sourceStart = (const UTF8*)Source.data();
// FIXME: Make the type of the result buffer correct instead of
@@ -45,6 +50,8 @@
&targetStart, targetStart + 4*Source.size(), flags);
if (result == conversionOK)
ResultPtr = reinterpret_cast<char*>(targetStart);
+ else
+ ErrorPtr = sourceStart;
}
assert((result != targetExhausted)
&& "ConvertUTF8toUTFXX exhausted target buffer");
@@ -67,4 +74,3 @@
}
} // end namespace clang
-