Be more lenient with 4-byte UTF-8 sequences. Accept 4-byte sequences and convert them into surrogate pairs instead of expecting 2 separate 3-byte sequences, each encoding one half of a surrogate pair. Note that in addition to supporting 4-byte sequences in strings from JNI, we also tolerate them in dex files. This is mainly for consistency, and there's no need to claim any sort of official support. bug: 18848397 bug: https://code.google.com/p/android/issues/detail?id=81341 Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29

commit: a5afcfc73141e5e378d79a326d02c5c2039fb025 [log] [tgz]
author: Narayan Kamath <narayan@google.com> Thu Jan 29 20:06:46 2015 +0000
committer: Narayan Kamath <narayan@google.com> Thu Feb 12 11:54:37 2015 +0000
tree: 424add9558fb816c4f1d2f4edd128f4f2a086d9a
parent: 5a3399deaf448c8434d9ba0916ff799b1b791d95 [diff] [blame]
diff --git a/runtime/check_jni.cc b/runtime/check_jni.cc
index e45d3a3..6ec0949 100644
--- a/runtime/check_jni.cc
+++ b/runtime/check_jni.cc

@@ -1095,6 +1095,8 @@
     return true;
   }
 
+  // Checks whether |bytes| is valid modified UTF-8. We also accept 4-byte UTF-8
+  // sequences in place of encoded surrogate pairs.
   static uint8_t CheckUtfBytes(const char* bytes, const char** errorKind) {
     while (*bytes != '\0') {
       uint8_t utf8 = *(bytes++);
@@ -1114,14 +1116,26 @@
       case 0x09:
       case 0x0a:
       case 0x0b:
-      case 0x0f:
-        /*
-         * Bit pattern 10xx or 1111, which are illegal start bytes.
-         * Note: 1111 is valid for normal UTF-8, but not the
-         * Modified UTF-8 used here.
-         */
+        // Bit pattern 10xx, which is an illegal start byte.
         *errorKind = "start";
         return utf8;
+      case 0x0f:
+        // Bit pattern 1111, which might be the start of a 4 byte sequence.
+        if ((utf8 & 0x08) == 0) {
+          // Bit pattern 1111 0xxx, which is the start of a 4 byte sequence.
+          // We consume one continuation byte here, and fall through to consume two more.
+          utf8 = *(bytes++);
+          if ((utf8 & 0xc0) != 0x80) {
+            *errorKind = "continuation";
+            return utf8;
+          }
+        } else {
+          *errorKind = "start";
+          return utf8;
+        }
+
+        // Fall through to the cases below to consume two more continuation bytes.
+        FALLTHROUGH_INTENDED;
       case 0x0e:
         // Bit pattern 1110, so there are two additional bytes.
         utf8 = *(bytes++);
@@ -1129,7 +1143,9 @@
           *errorKind = "continuation";
           return utf8;
         }
-        FALLTHROUGH_INTENDED;  // Fall-through to take care of the final byte.
+
+        // Fall through to consume one more continuation byte.
+        FALLTHROUGH_INTENDED;
       case 0x0c:
       case 0x0d:
         // Bit pattern 110x, so there is one additional byte.
commit	a5afcfc73141e5e378d79a326d02c5c2039fb025	[log] [tgz]
author	Narayan Kamath <narayan@google.com>	Thu Jan 29 20:06:46 2015 +0000
committer	Narayan Kamath <narayan@google.com>	Thu Feb 12 11:54:37 2015 +0000
tree	424add9558fb816c4f1d2f4edd128f4f2a086d9a
parent	5a3399deaf448c8434d9ba0916ff799b1b791d95 [diff] [blame]