Merge changes from topic "utf8decoder" am: 66805fc98d am: 6ded4af9ca
am: d50f5e2c1f

Change-Id: Ic3813456a964fa7de924c57e66f713439b7ee55a
diff --git a/benchmarks/src/benchmarks/regression/CharsetUtf8Benchmark.java b/benchmarks/src/benchmarks/regression/CharsetUtf8Benchmark.java
new file mode 100644
index 0000000..041e435
--- /dev/null
+++ b/benchmarks/src/benchmarks/regression/CharsetUtf8Benchmark.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package benchmarks.regression;
+
+import android.icu.lang.UCharacter;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Decodes similarly sized inputs of ASCII, BMP and supplementary characters using the fast-path
+ * UTF-8 decoder in {@link StringFactory#newStringFromBytes(byte[], int, int, Charset)}.
+ */
+public class CharsetUtf8Benchmark {
+
+    private static final int NO_OF_BYTES = 0x400000; // 4MB
+    private static final byte[] ASCII = makeUnicodeRange(0, 0x7f, NO_OF_BYTES / 0x80);
+    private static final byte[] BMP2 = makeUnicodeRange(0x0080, 0x07ff, NO_OF_BYTES / 2 / 0x780);
+    private static final byte[] BMP3 = makeUnicodeRange(0x0800, 0xffff,
+            NO_OF_BYTES / 3 / 0xf000 /* 0x10000 - 0x0800 - no of surrogate code points */);
+    private static final byte[] SUPPLEMENTARY = makeUnicodeRange(0x10000, 0x10ffff,
+            NO_OF_BYTES / 4 / 0x100000);
+
+    private static byte[] makeUnicodeRange(int startingCodePoint, int endingCodePoint,
+            int repeated) {
+        StringBuilder builder = new StringBuilder();
+        for (int codePoint = startingCodePoint; codePoint <= endingCodePoint; codePoint++) {
+            if (codePoint < Character.MIN_SURROGATE || codePoint > Character.MAX_SURROGATE) {
+                builder.append(UCharacter.toString(codePoint));
+            }
+        }
+
+        String str = builder.toString();
+        builder = new StringBuilder();
+        for (int i = 0; i < repeated; i++) {
+            builder.append(str);
+        }
+        return builder.toString().getBytes();
+    }
+
+    public void time_ascii() {
+        new String(ASCII, StandardCharsets.UTF_8);
+    }
+
+    public void time_bmp2() {
+        new String(BMP2, StandardCharsets.UTF_8);
+    }
+
+    public void time_bmp3() {
+        new String(BMP3, StandardCharsets.UTF_8);
+    }
+
+    public void time_supplementary() {
+        new String(SUPPLEMENTARY, StandardCharsets.UTF_8);
+    }
+}
diff --git a/expectations/knownfailures.txt b/expectations/knownfailures.txt
index ed88ae7..54a443c 100644
--- a/expectations/knownfailures.txt
+++ b/expectations/knownfailures.txt
@@ -1838,11 +1838,5 @@
   result: EXEC_FAILED,
   bug: 67395816,
   name: "libcore.java.util.zip.DeflateRegressionTest#deterministicOutput"
-},
-{
-  description: "Awaiting fix in fast-path UTF-8 decoder / CharsetDecoder",
-  result: EXEC_FAILED,
-  bug: 69599767,
-  name: "libcore.java.lang.StringTest#test_23831"
 }
 ]
diff --git a/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java b/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java
index 9884625..9db8174 100644
--- a/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java
+++ b/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java
@@ -198,8 +198,8 @@
         byte[] orig = new byte[] { (byte) 0xed, (byte) 0xa0,
                 (byte) 0x80 };
         String s = new String(orig, "UTF-8");
-        assertEquals(1, s.length());
-        assertEquals(55296, s.charAt(0));
+        assertEquals(3, s.length());
+        assertEquals("\ufffd\ufffd\ufffd", s);
         Charset.forName("UTF-8").encode(CharBuffer.wrap(s));
 //        ByteBuffer buf = <result>
 //        for (byte o : orig) {
diff --git a/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java b/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java
index 14b21f7..a0dff1c 100644
--- a/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java
+++ b/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java
@@ -2532,16 +2532,20 @@
                 0x30, 0x0B, 0x06, 0x03, 0x55, 0x04, 0x03,
                 // UTF8 String
                 0x0C, 0x04, (byte) 0xD0, (byte) 0xAF, 0x41, 0x41 }); // 0xD0AF == the last letter(capital) of Russian alphabet
-        list.add("CN=\"\\E0\\90\\AF\"", "CN=" + ((char) 1071), "CN="
-                + ((char) 1071), new byte[] { 0x30, 0x0D, 0x31, 0x0B, 0x30,
-                0x09, 0x06, 0x03, 0x55, 0x04, 0x03,
+        list.add("CN=\"\\E0\\90\\AF\"", "CN=\ufffd\ufffd\ufffd", "CN=\ufffd\ufffd\ufffd",
+                new byte[] { 0x30, 0x14, 0x31, 0x12, 0x30, 0x10, 0x06, 0x03, 0x55, 0x04, 0x03,
                 // UTF8 String
-                0x0C, 0x02, (byte) 0xD0, (byte) 0xAF }); // UTF8(0xE090AF that is not quite correct)== UTF8(0xD0AF) == the last letter(capital) of Russian alphabet
-        list.add("CN=\"\\F0\\80\\90\\AF\"", "CN=" + ((char) 1071), "CN="
-                + ((char) 1071), new byte[] { 0x30, 0x0D, 0x31, 0x0B, 0x30,
-                0x09, 0x06, 0x03, 0x55, 0x04, 0x03,
+                0x0C, 0x09, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0xEF, (byte) 0xBF,
+                (byte) 0xBD, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD });
+        // UTF8(0xE090AF) is not correct because it's an overlong form of UTF8(0xD0AF).
+        list.add("CN=\"\\F0\\80\\90\\AF\"", "CN=\ufffd\ufffd\ufffd\ufffd",
+                "CN=\ufffd\ufffd\ufffd\ufffd",
+                new byte[] { 0x30, 0x17, 0x31, 0x15, 0x30, 0x13, 0x06, 0x03, 0x55, 0x04, 0x03,
                 // UTF8 String
-                0x0C, 0x02, (byte) 0xD0, (byte) 0xAF }); // UTF8(0xF08090AF that is not quite correct)== UTF8(0xD0AF) == the last letter(capital) of Russian alphabet
+                0x0C, 0x0C, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0xEF, (byte) 0xBF,
+                (byte) 0xBD, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0xEF, (byte) 0xBF,
+                (byte) 0xBD });
+        // UTF8(0xF08090AF) is not correct because it's an overlong form of UTF8(0xD0AF).
 
         list.add("CN=\"\\41\"+ST=A", "CN=A+ST=A", "CN=A + ST=A"); // 0x41=='A'
         list.add("CN=\"\\41\\2C\"+ST=A", "CN=A\\,+ST=A", "CN=\"A,\" + ST=A"); // 0x41=='A', 0x2C=','
diff --git a/libart/src/main/java/java/lang/StringFactory.java b/libart/src/main/java/java/lang/StringFactory.java
index 208a657..1866562 100644
--- a/libart/src/main/java/java/lang/StringFactory.java
+++ b/libart/src/main/java/java/lang/StringFactory.java
@@ -65,6 +65,14 @@
         return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName));
     }
 
+    private static final int[] TABLE_UTF8_NEEDED = new int[] {
+    //      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
+            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
+            3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
+    };
+
     // TODO: Implement this method natively.
     public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) {
         if ((offset | byteCount) < 0 || byteCount > data.length - offset) {
@@ -77,98 +85,137 @@
         // We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed.
         String canonicalCharsetName = charset.name();
         if (canonicalCharsetName.equals("UTF-8")) {
+            /*
+            This code converts a UTF-8 byte sequence to a Java String (UTF-16).
+            It implements the W3C recommended UTF-8 decoder.
+            https://www.w3.org/TR/encoding/#utf-8-decoder
+
+            Unicode 3.2 Well-Formed UTF-8 Byte Sequences
+            Code Points        First  Second Third Fourth
+            U+0000..U+007F     00..7F
+            U+0080..U+07FF     C2..DF 80..BF
+            U+0800..U+0FFF     E0     A0..BF 80..BF
+            U+1000..U+CFFF     E1..EC 80..BF 80..BF
+            U+D000..U+D7FF     ED     80..9F 80..BF
+            U+E000..U+FFFF     EE..EF 80..BF 80..BF
+            U+10000..U+3FFFF   F0     90..BF 80..BF 80..BF
+            U+40000..U+FFFFF   F1..F3 80..BF 80..BF 80..BF
+            U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
+
+            Please refer to Unicode as the authority.
+            p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
+
+            Handling Malformed Input
+            Each maximal subpart should be replaced by a single U+FFFD. A maximal subpart is
+            the longest code unit subsequence starting at an unconvertible offset that is either
+            1) the initial subsequence of a well-formed code unit sequence, or
+            2) a subsequence of length one.
+            One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix
+            of a valid sequence, and the conversion should restart after the incomplete sequence.
+
+            For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are
+            "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80",
+            but "C0" can't be the initial subsequence of any well-formed code unit sequence.
+            Thus, the output should be "A\ufffd\ufffdA\ufffdA".
+
+            Please refer to section "Best Practices for Using U+FFFD." in
+            http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
+            */
             byte[] d = data;
             char[] v = new char[byteCount];
 
             int idx = offset;
             int last = offset + byteCount;
             int s = 0;
-outer:
+
+            int codePoint = 0;
+            int utf8BytesSeen = 0;
+            int utf8BytesNeeded = 0;
+            int lowerBound = 0x80;
+            int upperBound = 0xbf;
+
             while (idx < last) {
-                byte b0 = d[idx++];
-                if ((b0 & 0x80) == 0) {
-                    // 0xxxxxxx
-                    // Range:  U-00000000 - U-0000007F
-                    int val = b0 & 0xff;
-                    v[s++] = (char) val;
-                } else if (((b0 & 0xe0) == 0xc0) || ((b0 & 0xf0) == 0xe0) ||
-                        ((b0 & 0xf8) == 0xf0) || ((b0 & 0xfc) == 0xf8) || ((b0 & 0xfe) == 0xfc)) {
-                    int utfCount = 1;
-                    if ((b0 & 0xf0) == 0xe0) utfCount = 2;
-                    else if ((b0 & 0xf8) == 0xf0) utfCount = 3;
-                    else if ((b0 & 0xfc) == 0xf8) utfCount = 4;
-                    else if ((b0 & 0xfe) == 0xfc) utfCount = 5;
+                int b = d[idx++] & 0xff;
+                if (utf8BytesNeeded == 0) {
+                    if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
+                        v[s++] = (char) b;
+                        continue;
+                    }
 
-                    // 110xxxxx (10xxxxxx)+
-                    // Range:  U-00000080 - U-000007FF (count == 1)
-                    // Range:  U-00000800 - U-0000FFFF (count == 2)
-                    // Range:  U-00010000 - U-001FFFFF (count == 3)
-                    // Range:  U-00200000 - U-03FFFFFF (count == 4)
-                    // Range:  U-04000000 - U-7FFFFFFF (count == 5)
-
-                    if (idx + utfCount > last) {
+                    if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
                         v[s++] = REPLACEMENT_CHAR;
                         continue;
                     }
 
-                    // Extract usable bits from b0
-                    int val = b0 & (0x1f >> (utfCount - 1));
-                    for (int i = 0; i < utfCount; ++i) {
-                        byte b = d[idx++];
-                        if ((b & 0xc0) != 0x80) {
-                            v[s++] = REPLACEMENT_CHAR;
-                            idx--; // Put the input char back
-                            continue outer;
-                        }
-                        // Push new bits in from the right side
-                        val <<= 6;
-                        val |= b & 0x3f;
-                    }
-
-                    // Note: Java allows overlong char
-                    // specifications To disallow, check that val
-                    // is greater than or equal to the minimum
-                    // value for each count:
-                    //
-                    // count    min value
-                    // -----   ----------
-                    //   1           0x80
-                    //   2          0x800
-                    //   3        0x10000
-                    //   4       0x200000
-                    //   5      0x4000000
-
-                    // Allow surrogate values (0xD800 - 0xDFFF) to
-                    // be specified using 3-byte UTF values only
-                    if ((utfCount != 2) && (val >= 0xD800) && (val <= 0xDFFF)) {
+                    // 11xxxxxx
+                    int tableLookupIndex = b & 0x3f;
+                    utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex];
+                    if (utf8BytesNeeded == 0) {
                         v[s++] = REPLACEMENT_CHAR;
                         continue;
                     }
 
-                    // Reject chars greater than the Unicode maximum of U+10FFFF.
-                    if (val > 0x10FFFF) {
+                    // utf8BytesNeeded
+                    // 1: b & 0x1f
+                    // 2: b & 0x0f
+                    // 3: b & 0x07
+                    codePoint = b & (0x3f >> utf8BytesNeeded);
+                    if (b == 0xe0) {
+                        lowerBound = 0xa0;
+                    } else if (b == 0xed) {
+                        upperBound = 0x9f;
+                    } else if (b == 0xf0) {
+                        lowerBound = 0x90;
+                    } else if (b == 0xf4) {
+                        upperBound = 0x8f;
+                    }
+                } else {
+                    if (b < lowerBound || b > upperBound) {
+                        // The bytes seen are ill-formed. Substitute them with U+FFFD
                         v[s++] = REPLACEMENT_CHAR;
+                        codePoint = 0;
+                        utf8BytesNeeded = 0;
+                        utf8BytesSeen = 0;
+                        lowerBound = 0x80;
+                        upperBound = 0xbf;
+                        /*
+                         * According to the Unicode Standard,
+                         * "a UTF-8 conversion process is required to never consume well-formed
+                         * subsequences as part of its error handling for ill-formed subsequences"
+                         * The current byte could be part of a well-formed subsequence. Reduce the
+                         * index by 1 to parse it in the next loop iteration.
+                         */
+                        idx--;
+                        continue;
+                    }
+
+                    lowerBound = 0x80;
+                    upperBound = 0xbf;
+                    codePoint = (codePoint << 6) | (b & 0x3f);
+                    utf8BytesSeen++;
+                    if (utf8BytesNeeded != utf8BytesSeen) {
                         continue;
                     }
 
                     // Encode chars from U+10000 up as surrogate pairs
-                    if (val < 0x10000) {
-                        v[s++] = (char) val;
+                    if (codePoint < 0x10000) {
+                        v[s++] = (char) codePoint;
                     } else {
-                        int x = val & 0xffff;
-                        int u = (val >> 16) & 0x1f;
-                        int w = (u - 1) & 0xffff;
-                        int hi = 0xd800 | (w << 6) | (x >> 10);
-                        int lo = 0xdc00 | (x & 0x3ff);
-                        v[s++] = (char) hi;
-                        v[s++] = (char) lo;
+                        v[s++] = (char) ((codePoint >> 10) + 0xd7c0);
+                        v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00);
                     }
-                } else {
-                    // Illegal values 0x8*, 0x9*, 0xa*, 0xb*, 0xfd-0xff
-                    v[s++] = REPLACEMENT_CHAR;
+
+                    utf8BytesSeen = 0;
+                    utf8BytesNeeded = 0;
+                    codePoint = 0;
                 }
             }
 
+            // The remaining bytes seen form an incomplete sequence. Substitute them with U+FFFD
+            if (utf8BytesNeeded != 0) {
+                v[s++] = REPLACEMENT_CHAR;
+            }
+
             if (s == byteCount) {
                 // We guessed right, so we can use our temporary array as-is.
                 value = v;
diff --git a/luni/src/test/java/libcore/java/lang/StringTest.java b/luni/src/test/java/libcore/java/lang/StringTest.java
index 7e34d44..93f5aed 100644
--- a/luni/src/test/java/libcore/java/lang/StringTest.java
+++ b/luni/src/test/java/libcore/java/lang/StringTest.java
@@ -16,6 +16,11 @@
 
 package libcore.java.lang;
 
+import android.icu.lang.UCharacter;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.ReadOnlyBufferException;
@@ -24,9 +29,12 @@
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
+import java.nio.charset.ModifiedUtf8;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.ArrayList;
 import java.util.Locale;
+
 import junit.framework.TestCase;
 
 public class StringTest extends TestCase {
@@ -297,12 +305,15 @@
         assertEquals("project_Id", "projectId".replaceAll("(?!^)(\\p{Upper})(?!$)", "_$1"));
     }
 
+    // Test that CharsetDecoder and the fast-path decoder are consistent when handling ill-formed
+    // sequences. http://b/69599767
+    // This test was originally created for the bug
     // https://code.google.com/p/android/issues/detail?id=23831
-    public void test_23831() throws Exception {
+    public void test_69599767() throws Exception {
         byte[] bytes = { (byte) 0xf5, (byte) 0xa9, (byte) 0xea, (byte) 0x21 };
-        String expected = "\ufffd\ufffd\u0021";
+        String expected = "\ufffd\ufffd\ufffd\u0021";
 
-        // Since we use icu4c for CharsetDecoder...
+        // Since we use ICU4C for CharsetDecoder...
         CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
         decoder.onMalformedInput(CodingErrorAction.REPLACE);
         assertEquals(expected, decoder.decode(ByteBuffer.wrap(bytes)).toString());
@@ -311,6 +322,175 @@
         assertEquals(expected, new String(bytes, "UTF-8"));
     }
 
+    public void testFastPathString_wellFormedUtf8Sequence() throws Exception {
+        // U+0000 null
+        assertFastPathUtf8DecodedEquals("\u0000", "00");
+        // U+0031 ASCII char '1'
+        assertFastPathUtf8DecodedEquals("1", "31");
+        // U+007f
+        assertFastPathUtf8DecodedEquals("\u007f", "7f");
+        // 2-byte UTF-8 sequence
+        assertFastPathUtf8DecodedEquals("\u0080", "c2 80");
+        assertFastPathUtf8DecodedEquals("\u07ff", "df bf");
+        // 3-byte UTF-8 sequence
+        assertFastPathUtf8DecodedEquals("\u0800", "e0 a0 80");
+        assertFastPathUtf8DecodedEquals("\ud7ff", "ed 9f bf"); // last code point before surrogate
+        assertFastPathUtf8DecodedEquals("\ue000", "ee 80 80"); // first code point after surrogate
+        assertFastPathUtf8DecodedEquals("\uffff", "ef bf bf");
+        // U+10000 The minimum value of a Unicode supplementary code point
+        assertEquals("\ud800\udc00", String.valueOf(Character.toChars(0x10000)));
+        assertFastPathUtf8DecodedEquals("\ud800\udc00", "f0 90 80 80");
+        // U+10ffff The maximum value of a Unicode code point
+        assertEquals("\udbff\udfff", String.valueOf(Character.toChars(0x10ffff)));
+        assertFastPathUtf8DecodedEquals("\udbff\udfff", "f4 8f bf bf");
+
+        // Null in the middle
+        assertFastPathUtf8DecodedEquals("1\u00002\u07ff", "31 00 32 df bf");
+
+        assertFastPathUtf8DecodedEquals("\u0800\udbff\udfff\uffff1\u0080",
+                "e0 a0 80 f4 8f bf bf ef bf bf 31 c2 80");
+
+        // Check that the UTF-8 sequences of all code points are decoded correctly.
+        // Validate the decoder using byte sequence generated by UTF-8 encoder.
+        for (int codePoint = Character.MIN_CODE_POINT;
+                codePoint <= Character.MAX_CODE_POINT;
+                codePoint++) {
+            if (codePoint < Character.MIN_SURROGATE || codePoint > Character.MAX_SURROGATE) {
+                String expected = UCharacter.toString(codePoint);
+                // Android platform default is always UTF-8.
+                byte[] utf8Bytes = expected.getBytes();
+                assertEquals(expected, new String(utf8Bytes));
+            }
+        }
+    }
+
+    public void testFastPathString_illFormedUtf8Sequence() throws Exception {
+        // Overlong Sequence of ASCII char '1'
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd", "c0 b1");
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd", "e0 80 b1");
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd", "f0 80 80 b1");
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd", "f8 80 80 80 b1");
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", "fc 80 80 80 80 b1");
+
+        // Overlong null \u0000
+        // "c0 80" is a Modified UTF-8 sequence representing \u0000, but illegal in UTF-8.
+        assertEquals("\u0000", decodeModifiedUTF8("c0 80"));
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd", "c0 80");
+
+        // Overlong BMP char U+0080. The correct UTF-8 encoded form of U+0080 is 2-byte "c2 80".
+        // The overlong form can be obtained by filling 0x80 into 1110xxxx 10xxxxxx 10xxxxxx
+        // == 11100000 10000010 10000000. (hex form e0 82 80)
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd", "e0 82 80");
+
+        // Overlong supplementary character U+10000.
+        // The correct UTF-8 encoded form of U+10000 is 4-byte "f0 90 80 80".
+        // The overlong form can be obtained by filling 0x10000 into
+        // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+        // == 11111000 10000000 10010000 10000000 10000000. (hex form f8 80 90 80 80)
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd", "f8 80 90 80 80");
+
+        // Single surrogate in CESU-8 encoding
+        // A CESU-8 sequence, but illegal in UTF-8.
+        assertEquals("\ud800", decodeCESU8("ed a0 80"));
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd", "ed a0 80");
+
+        // Surrogate pair in CESU-8 encoding. The encoded code point is U+10000.
+        // Assert the bytes are a valid CESU-8 sequence before decoding them as UTF-8.
+        String surrogatePair = decodeCESU8("ed a0 80 ed b0 80");
+        assertEquals("\ud800\udc00", surrogatePair);
+        assertEquals(0x10000, Character.codePointAt(surrogatePair.toCharArray(), 0));
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd",
+                "ed a0 80 ed b0 80");
+
+        // Illegal first-byte
+        assertFastPathUtf8DecodedEquals("\ufffd", "c0");
+        assertFastPathUtf8DecodedEquals("\ufffd", "80");
+
+        // Maximal valid subpart. The byte 0x31 should be decoded into the ASCII char '1' rather
+        // than being consumed as part of the ill-formed byte sequence.
+        assertFastPathUtf8DecodedEquals("\ufffd1", "c2 31");
+        assertFastPathUtf8DecodedEquals("\ufffd1", "e1 31");
+        assertFastPathUtf8DecodedEquals("\ufffd1", "e1 80 31");
+        assertFastPathUtf8DecodedEquals("\ufffd1", "f1 31");
+        assertFastPathUtf8DecodedEquals("\ufffd1", "f1 80 31");
+        assertFastPathUtf8DecodedEquals("\ufffd1", "f1 80 80 31");;
+
+        // Ill-formed sequence at the end of the stream
+        assertFastPathUtf8DecodedEquals("1\ufffd", "31 c2");
+        assertFastPathUtf8DecodedEquals("1\ufffd", "31 e1");
+        assertFastPathUtf8DecodedEquals("1\ufffd", "31 e1 80");
+        assertFastPathUtf8DecodedEquals("1\ufffd", "31 f1");
+        assertFastPathUtf8DecodedEquals("1\ufffd", "31 f1 80");
+        assertFastPathUtf8DecodedEquals("1\ufffd", "31 f1 80 80");
+
+        // Test lower and upper bound of first trail byte when leading byte is e0/ed/f0/f4
+        // Valid range of trail byte is A0..BF.
+        assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 e0 9f");
+        assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 e0 c0");
+        // Valid range of trail byte is 80..9F.
+        assertFastPathUtf8DecodedEquals("1\ufffd\u007f", "31 ed 7f");
+        assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 ed a0");
+        // Valid range of trail byte is 90..BF.
+        assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 f0 8f");
+        assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 f0 c0");
+        // Valid range of trail byte is 80..8F.
+        assertFastPathUtf8DecodedEquals("1\ufffd\u007f", "31 f4 7f");
+        assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 f4 90");
+
+        // More ill-formed sequences
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd1", "f1 80 80 e1 80 31");
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd1", "f1 80 80 c0 b1 31");
+        assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd1", "f1 80 80 ed a0 31");
+        assertFastPathUtf8DecodedEquals("A\ufffd\ufffdA\ufffdA", "41 C0 AF 41 F4 80 80 41");
+    }
+
+    private void assertFastPathUtf8DecodedEquals(String expected, String hexString)
+            throws Exception {
+        String actual = new String(hexStringtoBytes(hexString));
+        assertEquals("Fast-path UTF-8 decoder decodes sequence [" + hexString
+                        + "] into unexpected String",
+                expected, actual);
+        // Since we use ICU4C for CharsetDecoder,
+        // check UTF-8 CharsetDecoder has the same result as the fast-path decoder
+        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
+                .onMalformedInput(CodingErrorAction.REPLACE);
+        assertEquals("Fast-path UTF-8 decoder and UTF-8 CharsetDecoder has a different conversion"
+                        + " result for sequence [" + hexString + "]",
+                decoder.decode(ByteBuffer.wrap(hexStringtoBytes(hexString))).toString(), actual);
+    }
+
+    private static String decodeCESU8(String hexString) throws IOException {
+        CharsetDecoder cesu8Decoder = Charset.forName("CESU-8").newDecoder();
+        return cesu8Decoder.decode(ByteBuffer.wrap(hexStringtoBytes(hexString))).toString();
+    }
+
+    private static String decodeModifiedUTF8(String hexString) throws IOException {
+        byte[] bytes = hexStringtoBytes(hexString);
+        // DataInputStream stores length as 2-byte short. Check the length before decoding
+        if (bytes.length > 0xffff) {
+            throw new IllegalArgumentException("Modified UTF-8 bytes are too long.");
+        }
+        byte[] buf = new byte[bytes.length + 2];
+        buf[0] = (byte)(bytes.length >>> 8);
+        buf[1] = (byte) bytes.length;
+        System.arraycopy(bytes, 0, buf, 2, bytes.length);
+        DataInputStream dis = new DataInputStream(new ByteArrayInputStream(buf));
+        return dis.readUTF();
+    }
+
+    private static byte[] hexStringtoBytes(String input) {
+        String[] parts = input.split(" ");
+        byte[] bytes = new byte[parts.length];
+        for (int i = 0; i < parts.length; i++) {
+            int val = Integer.parseInt(parts[i], 16);
+            if (val < 0 || val > 255) {
+                throw new IllegalArgumentException();
+            }
+            bytes[i] = (byte) (0xff & val);
+        }
+        return bytes;
+    }
+
     // https://code.google.com/p/android/issues/detail?id=55129
     public void test_55129() throws Exception {
         assertEquals("-h-e-l-l-o- -w-o-r-l-d-", "hello world".replace("", "-"));