Be more lenient with 4-byte UTF-8 sequences.

Accept 4-byte sequences and convert them into surrogate
pairs instead of expecting two separate 3-byte sequences,
each encoding one half of a surrogate pair.

Note that in addition to supporting 4-byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency, and there's no need to claim any
sort of official support.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 7ff296b..39c8d15 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -38,15 +38,30 @@
       // two-byte encoding
       continue;
     }
-    // three-byte encoding
     utf8++;
+    if ((ic & 0x10) == 0) {
+      // three-byte encoding
+      continue;
+    }
+
+    // four-byte encoding: needs to be converted into a surrogate
+    // pair.
+    utf8++;
+    len++;
   }
   return len;
 }
 
 void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
   while (*utf8_data_in != '\0') {
-    *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in);
+    const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
+    const uint16_t leading = GetLeadingUtf16Char(ch);
+    const uint16_t trailing = GetTrailingUtf16Char(ch);
+
+    *utf16_data_out++ = leading;
+    if (trailing != 0) {
+      *utf16_data_out++ = trailing;
+    }
   }
 }
 
@@ -102,12 +117,29 @@
       return 1;
     }
 
-    int c1 = GetUtf16FromUtf8(&utf8);
-    int c2 = *utf16++;
-    --utf16_length;
+    const uint32_t pair = GetUtf16FromUtf8(&utf8);
 
-    if (c1 != c2) {
-      return c1 > c2 ? 1 : -1;
+    // First compare the leading utf16 char.
+    const uint16_t lhs = GetLeadingUtf16Char(pair);
+    const uint16_t rhs = *utf16++;
+    --utf16_length;
+    if (lhs != rhs) {
+      return lhs > rhs ? 1 : -1;
+    }
+
+    // Then compare the trailing utf16 char. First check if there
+    // are any characters left to consume.
+    const uint16_t lhs2 = GetTrailingUtf16Char(pair);
+    if (lhs2 != 0) {
+      if (utf16_length == 0) {
+        return 1;
+      }
+
+      const uint16_t rhs2 = *utf16++;
+      --utf16_length;
+      if (lhs2 != rhs2) {
+        return lhs2 > rhs2 ? 1 : -1;
+      }
     }
   }
 }