Merge changes from topic "utf8decoder" am: 66805fc98d am: 6ded4af9ca
am: d50f5e2c1f
Change-Id: Ic3813456a964fa7de924c57e66f713439b7ee55a
diff --git a/benchmarks/src/benchmarks/regression/CharsetUtf8Benchmark.java b/benchmarks/src/benchmarks/regression/CharsetUtf8Benchmark.java
new file mode 100644
index 0000000..041e435
--- /dev/null
+++ b/benchmarks/src/benchmarks/regression/CharsetUtf8Benchmark.java
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package benchmarks.regression;
+
+import android.icu.lang.UCharacter;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Decodes roughly equally sized (about 4MB) inputs of ASCII, BMP and supplementary characters
+ * using the fast-path UTF-8 decoder in {@link StringFactory#newStringFromBytes(byte[], int, int, Charset)}.
+ */
+public class CharsetUtf8Benchmark {
+
+ private static final int NO_OF_BYTES = 0x400000; // 4MB
+ private static final byte[] ASCII = makeUnicodeRange(0, 0x7f, NO_OF_BYTES / 0x80);
+ private static final byte[] BMP2 = makeUnicodeRange(0x0080, 0x07ff, NO_OF_BYTES / 2 / 0x780);
+ private static final byte[] BMP3 = makeUnicodeRange(0x0800, 0xffff,
+ NO_OF_BYTES / 3 / 0xf000 /* 0x10000 - 0x0800 - no of surrogate code points */);
+ private static final byte[] SUPPLEMENTARY = makeUnicodeRange(0x10000, 0x10ffff,
+ NO_OF_BYTES / 4 / 0x100000);
+
+ /** Returns UTF-8 bytes of the range's non-surrogate code points, {@code repeated} times. */
+ private static byte[] makeUnicodeRange(int startingCodePoint, int endingCodePoint,
+ int repeated) {
+ StringBuilder builder = new StringBuilder();
+ for (int codePoint = startingCodePoint; codePoint <= endingCodePoint; codePoint++) {
+ if (codePoint < Character.MIN_SURROGATE || codePoint > Character.MAX_SURROGATE) {
+ builder.append(UCharacter.toString(codePoint));
+ }
+ }
+ String str = builder.toString();
+ builder = new StringBuilder();
+ for (int i = 0; i < repeated; i++) {
+ builder.append(str);
+ }
+ return builder.toString().getBytes(StandardCharsets.UTF_8);
+ }
+
+ public void time_ascii() {
+ new String(ASCII, StandardCharsets.UTF_8);
+ }
+
+ public void time_bmp2() {
+ new String(BMP2, StandardCharsets.UTF_8);
+ }
+
+ public void time_bmp3() {
+ new String(BMP3, StandardCharsets.UTF_8);
+ }
+
+ public void time_supplementary() {
+ new String(SUPPLEMENTARY, StandardCharsets.UTF_8);
+ }
+}
diff --git a/expectations/knownfailures.txt b/expectations/knownfailures.txt
index ed88ae7..54a443c 100644
--- a/expectations/knownfailures.txt
+++ b/expectations/knownfailures.txt
@@ -1838,11 +1838,5 @@
result: EXEC_FAILED,
bug: 67395816,
name: "libcore.java.util.zip.DeflateRegressionTest#deterministicOutput"
-},
-{
- description: "Awaiting fix in fast-path UTF-8 decoder / CharsetDecoder",
- result: EXEC_FAILED,
- bug: 69599767,
- name: "libcore.java.lang.StringTest#test_23831"
}
]
diff --git a/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java b/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java
index 9884625..9db8174 100644
--- a/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java
+++ b/harmony-tests/src/test/java/org/apache/harmony/tests/java/nio/charset/CharsetEncoder2Test.java
@@ -198,8 +198,8 @@
byte[] orig = new byte[] { (byte) 0xed, (byte) 0xa0,
(byte) 0x80 };
String s = new String(orig, "UTF-8");
- assertEquals(1, s.length());
- assertEquals(55296, s.charAt(0));
+ assertEquals(3, s.length());
+ assertEquals("\ufffd\ufffd\ufffd", s);
Charset.forName("UTF-8").encode(CharBuffer.wrap(s));
// ByteBuffer buf = <result>
// for (byte o : orig) {
diff --git a/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java b/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java
index 14b21f7..a0dff1c 100644
--- a/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java
+++ b/harmony-tests/src/test/java/org/apache/harmony/tests/javax/security/auth/x500/X500PrincipalTest.java
@@ -2532,16 +2532,20 @@
0x30, 0x0B, 0x06, 0x03, 0x55, 0x04, 0x03,
// UTF8 String
0x0C, 0x04, (byte) 0xD0, (byte) 0xAF, 0x41, 0x41 }); // 0xD0AF == the last letter(capital) of Russian alphabet
- list.add("CN=\"\\E0\\90\\AF\"", "CN=" + ((char) 1071), "CN="
- + ((char) 1071), new byte[] { 0x30, 0x0D, 0x31, 0x0B, 0x30,
- 0x09, 0x06, 0x03, 0x55, 0x04, 0x03,
+ list.add("CN=\"\\E0\\90\\AF\"", "CN=\ufffd\ufffd\ufffd", "CN=\ufffd\ufffd\ufffd",
+ new byte[] { 0x30, 0x14, 0x31, 0x12, 0x30, 0x10, 0x06, 0x03, 0x55, 0x04, 0x03,
// UTF8 String
- 0x0C, 0x02, (byte) 0xD0, (byte) 0xAF }); // UTF8(0xE090AF that is not quite correct)== UTF8(0xD0AF) == the last letter(capital) of Russian alphabet
- list.add("CN=\"\\F0\\80\\90\\AF\"", "CN=" + ((char) 1071), "CN="
- + ((char) 1071), new byte[] { 0x30, 0x0D, 0x31, 0x0B, 0x30,
- 0x09, 0x06, 0x03, 0x55, 0x04, 0x03,
+ 0x0C, 0x09, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0xEF, (byte) 0xBF,
+ (byte) 0xBD, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD });
+ // UTF8(0xE090AF) is not correct because it's an overlong form of UTF8(0xD0AF).
+ list.add("CN=\"\\F0\\80\\90\\AF\"", "CN=\ufffd\ufffd\ufffd\ufffd",
+ "CN=\ufffd\ufffd\ufffd\ufffd",
+ new byte[] { 0x30, 0x17, 0x31, 0x15, 0x30, 0x13, 0x06, 0x03, 0x55, 0x04, 0x03,
// UTF8 String
- 0x0C, 0x02, (byte) 0xD0, (byte) 0xAF }); // UTF8(0xF08090AF that is not quite correct)== UTF8(0xD0AF) == the last letter(capital) of Russian alphabet
+ 0x0C, 0x0C, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0xEF, (byte) 0xBF,
+ (byte) 0xBD, (byte) 0xEF, (byte) 0xBF, (byte) 0xBD, (byte) 0xEF, (byte) 0xBF,
+ (byte) 0xBD });
+ // UTF8(0xF08090AF) is not correct because it's an overlong form of UTF8(0xD0AF).
list.add("CN=\"\\41\"+ST=A", "CN=A+ST=A", "CN=A + ST=A"); // 0x41=='A'
list.add("CN=\"\\41\\2C\"+ST=A", "CN=A\\,+ST=A", "CN=\"A,\" + ST=A"); // 0x41=='A', 0x2C=','
diff --git a/libart/src/main/java/java/lang/StringFactory.java b/libart/src/main/java/java/lang/StringFactory.java
index 208a657..1866562 100644
--- a/libart/src/main/java/java/lang/StringFactory.java
+++ b/libart/src/main/java/java/lang/StringFactory.java
@@ -65,6 +65,14 @@
return newStringFromBytes(data, 0, data.length, Charset.forNameUEE(charsetName));
}
+ private static final int[] TABLE_UTF8_NEEDED = new int[] {
+ // 0 1 2 3 4 5 6 7 8 9 a b c d e f
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
+ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
+ };
+
// TODO: Implement this method natively.
public static String newStringFromBytes(byte[] data, int offset, int byteCount, Charset charset) {
if ((offset | byteCount) < 0 || byteCount > data.length - offset) {
@@ -77,98 +85,137 @@
// We inline UTF-8, ISO-8859-1, and US-ASCII decoders for speed.
String canonicalCharsetName = charset.name();
if (canonicalCharsetName.equals("UTF-8")) {
+ /*
+ This code converts a UTF-8 byte sequence to a Java String (UTF-16).
+ It implements the W3C recommended UTF-8 decoder.
+ https://www.w3.org/TR/encoding/#utf-8-decoder
+
+ Unicode 3.2 Well-Formed UTF-8 Byte Sequences
+ Code Points First Second Third Fourth
+ U+0000..U+007F 00..7F
+ U+0080..U+07FF C2..DF 80..BF
+ U+0800..U+0FFF E0 A0..BF 80..BF
+ U+1000..U+CFFF E1..EC 80..BF 80..BF
+ U+D000..U+D7FF ED 80..9F 80..BF
+ U+E000..U+FFFF EE..EF 80..BF 80..BF
+ U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
+ U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
+ U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+
+ Please refer to Unicode as the authority.
+ p.126 Table 3-7 in http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
+
+ Handling Malformed Input
+ Each maximal subpart should be replaced by a single U+FFFD. A maximal subpart is
+ the longest code unit subsequence starting at an unconvertible offset that is either
+ 1) the initial subsequence of a well-formed code unit sequence, or
+ 2) a subsequence of length one.
+ One U+FFFD should be emitted for every sequence of bytes that is an incomplete prefix
+ of a valid sequence, with the conversion restarting after the incomplete sequence.
+
+ For example, in byte sequence "41 C0 AF 41 F4 80 80 41", the maximal subparts are
+ "C0", "AF", and "F4 80 80". "F4 80 80" can be the initial subsequence of "F4 80 80 80",
+ but "C0" can't be the initial subsequence of any well-formed code unit sequence.
+ Thus, the output should be "A\ufffd\ufffdA\ufffdA".
+
+ Please refer to section "Best Practices for Using U+FFFD." in
+ http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf
+ */
byte[] d = data;
char[] v = new char[byteCount];
int idx = offset;
int last = offset + byteCount;
int s = 0;
-outer:
+
+ int codePoint = 0;
+ int utf8BytesSeen = 0;
+ int utf8BytesNeeded = 0;
+ int lowerBound = 0x80;
+ int upperBound = 0xbf;
+
while (idx < last) {
- byte b0 = d[idx++];
- if ((b0 & 0x80) == 0) {
- // 0xxxxxxx
- // Range: U-00000000 - U-0000007F
- int val = b0 & 0xff;
- v[s++] = (char) val;
- } else if (((b0 & 0xe0) == 0xc0) || ((b0 & 0xf0) == 0xe0) ||
- ((b0 & 0xf8) == 0xf0) || ((b0 & 0xfc) == 0xf8) || ((b0 & 0xfe) == 0xfc)) {
- int utfCount = 1;
- if ((b0 & 0xf0) == 0xe0) utfCount = 2;
- else if ((b0 & 0xf8) == 0xf0) utfCount = 3;
- else if ((b0 & 0xfc) == 0xf8) utfCount = 4;
- else if ((b0 & 0xfe) == 0xfc) utfCount = 5;
+ int b = d[idx++] & 0xff;
+ if (utf8BytesNeeded == 0) {
+ if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
+ v[s++] = (char) b;
+ continue;
+ }
- // 110xxxxx (10xxxxxx)+
- // Range: U-00000080 - U-000007FF (count == 1)
- // Range: U-00000800 - U-0000FFFF (count == 2)
- // Range: U-00010000 - U-001FFFFF (count == 3)
- // Range: U-00200000 - U-03FFFFFF (count == 4)
- // Range: U-04000000 - U-7FFFFFFF (count == 5)
-
- if (idx + utfCount > last) {
+ if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
v[s++] = REPLACEMENT_CHAR;
continue;
}
- // Extract usable bits from b0
- int val = b0 & (0x1f >> (utfCount - 1));
- for (int i = 0; i < utfCount; ++i) {
- byte b = d[idx++];
- if ((b & 0xc0) != 0x80) {
- v[s++] = REPLACEMENT_CHAR;
- idx--; // Put the input char back
- continue outer;
- }
- // Push new bits in from the right side
- val <<= 6;
- val |= b & 0x3f;
- }
-
- // Note: Java allows overlong char
- // specifications To disallow, check that val
- // is greater than or equal to the minimum
- // value for each count:
- //
- // count min value
- // ----- ----------
- // 1 0x80
- // 2 0x800
- // 3 0x10000
- // 4 0x200000
- // 5 0x4000000
-
- // Allow surrogate values (0xD800 - 0xDFFF) to
- // be specified using 3-byte UTF values only
- if ((utfCount != 2) && (val >= 0xD800) && (val <= 0xDFFF)) {
+ // 11xxxxxx
+ int tableLookupIndex = b & 0x3f;
+ utf8BytesNeeded = TABLE_UTF8_NEEDED[tableLookupIndex];
+ if (utf8BytesNeeded == 0) {
v[s++] = REPLACEMENT_CHAR;
continue;
}
- // Reject chars greater than the Unicode maximum of U+10FFFF.
- if (val > 0x10FFFF) {
+ // utf8BytesNeeded
+ // 1: b & 0x1f
+ // 2: b & 0x0f
+ // 3: b & 0x07
+ codePoint = b & (0x3f >> utf8BytesNeeded);
+ if (b == 0xe0) {
+ lowerBound = 0xa0;
+ } else if (b == 0xed) {
+ upperBound = 0x9f;
+ } else if (b == 0xf0) {
+ lowerBound = 0x90;
+ } else if (b == 0xf4) {
+ upperBound = 0x8f;
+ }
+ } else {
+ if (b < lowerBound || b > upperBound) {
+ // The bytes seen are ill-formed. Substitute them with U+FFFD
v[s++] = REPLACEMENT_CHAR;
+ codePoint = 0;
+ utf8BytesNeeded = 0;
+ utf8BytesSeen = 0;
+ lowerBound = 0x80;
+ upperBound = 0xbf;
+ /*
+ * According to the Unicode Standard,
+ * "a UTF-8 conversion process is required to never consume well-formed
+ * subsequences as part of its error handling for ill-formed subsequences"
+ * The current byte could be part of well-formed subsequences. Reduce the
+ * index by 1 to parse it in next loop.
+ */
+ idx--;
+ continue;
+ }
+
+ lowerBound = 0x80;
+ upperBound = 0xbf;
+ codePoint = (codePoint << 6) | (b & 0x3f);
+ utf8BytesSeen++;
+ if (utf8BytesNeeded != utf8BytesSeen) {
continue;
}
// Encode chars from U+10000 up as surrogate pairs
- if (val < 0x10000) {
- v[s++] = (char) val;
+ if (codePoint < 0x10000) {
+ v[s++] = (char) codePoint;
} else {
- int x = val & 0xffff;
- int u = (val >> 16) & 0x1f;
- int w = (u - 1) & 0xffff;
- int hi = 0xd800 | (w << 6) | (x >> 10);
- int lo = 0xdc00 | (x & 0x3ff);
- v[s++] = (char) hi;
- v[s++] = (char) lo;
+ v[s++] = (char) ((codePoint >> 10) + 0xd7c0);
+ v[s++] = (char) ((codePoint & 0x3ff) + 0xdc00);
}
- } else {
- // Illegal values 0x8*, 0x9*, 0xa*, 0xb*, 0xfd-0xff
- v[s++] = REPLACEMENT_CHAR;
+
+ utf8BytesSeen = 0;
+ utf8BytesNeeded = 0;
+ codePoint = 0;
}
}
+ // Input ended mid-sequence, so the bytes seen are ill-formed. Substitute them with U+FFFD.
+ if (utf8BytesNeeded != 0) {
+ v[s++] = REPLACEMENT_CHAR;
+ }
+
if (s == byteCount) {
// We guessed right, so we can use our temporary array as-is.
value = v;
diff --git a/luni/src/test/java/libcore/java/lang/StringTest.java b/luni/src/test/java/libcore/java/lang/StringTest.java
index 7e34d44..93f5aed 100644
--- a/luni/src/test/java/libcore/java/lang/StringTest.java
+++ b/luni/src/test/java/libcore/java/lang/StringTest.java
@@ -16,6 +16,11 @@
package libcore.java.lang;
+import android.icu.lang.UCharacter;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.ReadOnlyBufferException;
@@ -24,9 +29,12 @@
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
+import java.nio.charset.ModifiedUtf8;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Locale;
+
import junit.framework.TestCase;
public class StringTest extends TestCase {
@@ -297,12 +305,15 @@
assertEquals("project_Id", "projectId".replaceAll("(?!^)(\\p{Upper})(?!$)", "_$1"));
}
+ // Test that CharsetDecoder and the fast-path decoder are consistent when handling ill-formed
+ // sequences. http://b/69599767
+ // This test was originally created for the bug
// https://code.google.com/p/android/issues/detail?id=23831
- public void test_23831() throws Exception {
+ public void test_69599767() throws Exception {
byte[] bytes = { (byte) 0xf5, (byte) 0xa9, (byte) 0xea, (byte) 0x21 };
- String expected = "\ufffd\ufffd\u0021";
+ String expected = "\ufffd\ufffd\ufffd\u0021";
- // Since we use icu4c for CharsetDecoder...
+ // Since we use ICU4C for CharsetDecoder...
CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
decoder.onMalformedInput(CodingErrorAction.REPLACE);
assertEquals(expected, decoder.decode(ByteBuffer.wrap(bytes)).toString());
@@ -311,6 +322,175 @@
assertEquals(expected, new String(bytes, "UTF-8"));
}
+ public void testFastPathString_wellFormedUtf8Sequence() throws Exception {
+ // U+0000 null
+ assertFastPathUtf8DecodedEquals("\u0000", "00");
+ // U+0031 ASCII char '1'
+ assertFastPathUtf8DecodedEquals("1", "31");
+ // U+007f
+ assertFastPathUtf8DecodedEquals("\u007f", "7f");
+ // 2-byte UTF-8 sequence
+ assertFastPathUtf8DecodedEquals("\u0080", "c2 80");
+ assertFastPathUtf8DecodedEquals("\u07ff", "df bf");
+ // 3-byte UTF-8 sequence
+ assertFastPathUtf8DecodedEquals("\u0800", "e0 a0 80");
+ assertFastPathUtf8DecodedEquals("\ud7ff", "ed 9f bf"); // last code point before surrogate
+ assertFastPathUtf8DecodedEquals("\ue000", "ee 80 80"); // first code point after surrogate
+ assertFastPathUtf8DecodedEquals("\uffff", "ef bf bf");
+ // U+10000 The minimum value of a Unicode supplementary code point
+ assertEquals("\ud800\udc00", String.valueOf(Character.toChars(0x10000)));
+ assertFastPathUtf8DecodedEquals("\ud800\udc00", "f0 90 80 80");
+ // U+10ffff The maximum value of a Unicode code point
+ assertEquals("\udbff\udfff", String.valueOf(Character.toChars(0x10ffff)));
+ assertFastPathUtf8DecodedEquals("\udbff\udfff", "f4 8f bf bf");
+
+ // Null in the middle
+ assertFastPathUtf8DecodedEquals("1\u00002\u07ff", "31 00 32 df bf");
+
+ assertFastPathUtf8DecodedEquals("\u0800\udbff\udfff\uffff1\u0080",
+ "e0 a0 80 f4 8f bf bf ef bf bf 31 c2 80");
+
+ // Check that the UTF-8 sequences of all code points are decoded correctly.
+ // Validate the decoder using byte sequence generated by UTF-8 encoder.
+ for (int codePoint = Character.MIN_CODE_POINT;
+ codePoint <= Character.MAX_CODE_POINT;
+ codePoint++) {
+ if (codePoint < Character.MIN_SURROGATE || codePoint > Character.MAX_SURROGATE) {
+ String expected = UCharacter.toString(codePoint);
+ // Android platform default is always UTF-8.
+ byte[] utf8Bytes = expected.getBytes();
+ assertEquals(expected, new String(utf8Bytes));
+ }
+ }
+ }
+
+ public void testFastPathString_illFormedUtf8Sequence() throws Exception {
+ // Overlong Sequence of ASCII char '1'
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd", "c0 b1");
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd", "e0 80 b1");
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd", "f0 80 80 b1");
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd", "f8 80 80 80 b1");
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", "fc 80 80 80 80 b1");
+
+ // Overlong null \u0000
+ // "c0 80" is a Modified UTF-8 sequence representing \u0000, but illegal in UTF-8.
+ assertEquals("\u0000", decodeModifiedUTF8("c0 80"));
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd", "c0 80");
+
+ // Overlong BMP char U+0080. The correct UTF-8 encoded form of U+0080 is 2-byte "c2 80".
+ // The overlong form can be obtained by filling 0x80 into 1110xxxx 10xxxxxx 10xxxxxx
+ // == 11100000 10000010 10000000. (hex form e0 82 80)
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd", "e0 82 80");
+
+ // Overlong Supplementary Characters U+10000.
+ // The correct UTF-8 encoded form of U+10000 is 4-byte "f0 90 80 80".
+ // The overlong form can be obtained by filling 0x10000 into
+ // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ // == 11111000 10000000 10010000 10000000 10000000. (hex form f8 80 90 80 80)
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd", "f8 80 90 80 80");
+
+ // Single surrogate in CESU-8 encoding
+ // A CESU-8 sequence, but illegal in UTF-8.
+ assertEquals("\ud800", decodeCESU8("ed a0 80"));
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd", "ed a0 80");
+
+ // Surrogate pair in CESU-8 encoding. The encoded code point is U+10000.
+ // Assert the bytes are valid CESU-8 sequence before decoding using UTF-8
+ String surrogatePair = decodeCESU8("ed a0 80 ed b0 80");
+ assertEquals("\ud800\udc00", surrogatePair);
+ assertEquals(0x10000, Character.codePointAt(surrogatePair.toCharArray(), 0));
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd",
+ "ed a0 80 ed b0 80");
+
+ // Illegal first-byte
+ assertFastPathUtf8DecodedEquals("\ufffd", "c0");
+ assertFastPathUtf8DecodedEquals("\ufffd", "80");
+
+ // Maximal valid subpart. byte 0x31 should be decoded into ASCII char '1', not part of
+ // ill-formed byte sequence
+ assertFastPathUtf8DecodedEquals("\ufffd1", "c2 31");
+ assertFastPathUtf8DecodedEquals("\ufffd1", "e1 31");
+ assertFastPathUtf8DecodedEquals("\ufffd1", "e1 80 31");
+ assertFastPathUtf8DecodedEquals("\ufffd1", "f1 31");
+ assertFastPathUtf8DecodedEquals("\ufffd1", "f1 80 31");
+ assertFastPathUtf8DecodedEquals("\ufffd1", "f1 80 80 31");;
+
+ // Ill-formed sequence at the end of the stream
+ assertFastPathUtf8DecodedEquals("1\ufffd", "31 c2");
+ assertFastPathUtf8DecodedEquals("1\ufffd", "31 e1");
+ assertFastPathUtf8DecodedEquals("1\ufffd", "31 e1 80");
+ assertFastPathUtf8DecodedEquals("1\ufffd", "31 f1");
+ assertFastPathUtf8DecodedEquals("1\ufffd", "31 f1 80");
+ assertFastPathUtf8DecodedEquals("1\ufffd", "31 f1 80 80");
+
+ // Test lower and upper bound of first trail byte when leading byte is e0/ed/f0/f4
+ // Valid range of trail byte is A0..BF.
+ assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 e0 9f");
+ assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 e0 c0");
+ // Valid range of trail byte is 80..9F.
+ assertFastPathUtf8DecodedEquals("1\ufffd\u007f", "31 ed 7f");
+ assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 ed a0");
+ // Valid range of trail byte is 90..BF.
+ assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 f0 8f");
+ assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 f0 c0");
+ // Valid range of trail byte is 80..8F.
+ assertFastPathUtf8DecodedEquals("1\ufffd\u007f", "31 f4 7f");
+ assertFastPathUtf8DecodedEquals("1\ufffd\ufffd", "31 f4 90");
+
+ // More ill-formed sequences
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd1", "f1 80 80 e1 80 31");
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd1", "f1 80 80 c0 b1 31");
+ assertFastPathUtf8DecodedEquals("\ufffd\ufffd\ufffd1", "f1 80 80 ed a0 31");
+ assertFastPathUtf8DecodedEquals("A\ufffd\ufffdA\ufffdA", "41 C0 AF 41 F4 80 80 41");
+ }
+
+ private void assertFastPathUtf8DecodedEquals(String expected, String hexString)
+ throws Exception {
+ String actual = new String(hexStringtoBytes(hexString));
+ assertEquals("Fast-path UTF-8 decoder decodes sequence [" + hexString
+ + "] into unexpected String",
+ expected, actual);
+ // Since we use ICU4C for CharsetDecoder,
+ // check UTF-8 CharsetDecoder has the same result as the fast-path decoder
+ CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE);
+ assertEquals("Fast-path UTF-8 decoder and UTF-8 CharsetDecoder has a different conversion"
+ + " result for sequence [" + hexString + "]",
+ decoder.decode(ByteBuffer.wrap(hexStringtoBytes(hexString))).toString(), actual);
+ }
+
+ private static String decodeCESU8(String hexString) throws IOException {
+ CharsetDecoder cesu8Decoder = Charset.forName("CESU-8").newDecoder();
+ return cesu8Decoder.decode(ByteBuffer.wrap(hexStringtoBytes(hexString))).toString();
+ }
+
+ private static String decodeModifiedUTF8(String hexString) throws IOException {
+ byte[] bytes = hexStringtoBytes(hexString);
+ // DataInputStream#readUTF reads the length as an unsigned 2-byte value. Check it before decoding
+ if (bytes.length > 0xffff) {
+ throw new IllegalArgumentException("Modified UTF-8 bytes are too long.");
+ }
+ byte[] buf = new byte[bytes.length + 2];
+ buf[0] = (byte)(bytes.length >>> 8);
+ buf[1] = (byte) bytes.length;
+ System.arraycopy(bytes, 0, buf, 2, bytes.length);
+ DataInputStream dis = new DataInputStream(new ByteArrayInputStream(buf));
+ return dis.readUTF();
+ }
+
+ private static byte[] hexStringtoBytes(String input) {
+ String[] parts = input.split(" ");
+ byte[] bytes = new byte[parts.length];
+ for (int i = 0; i < parts.length; i++) {
+ int val = Integer.parseInt(parts[i], 16);
+ if (val < 0 || val > 255) {
+ throw new IllegalArgumentException();
+ }
+ bytes[i] = (byte) (0xff & val);
+ }
+ return bytes;
+ }
+
// https://code.google.com/p/android/issues/detail?id=55129
public void test_55129() throws Exception {
assertEquals("-h-e-l-l-o- -w-o-r-l-d-", "hello world".replace("", "-"));