Part 2 of the "new String"/String.getBytes performance work.

I didn't plan on a part 2, but my benchmark was bogus. I'd failed to take into
account the fact that the ICU code (which I was comparing against) has a higher
intercept but lower slope than the Java code I replaced it with. This new code
offers the best of both worlds: low intercept (start-up cost) and low slope
(per-byte/char cost).

The bad news is that this means I'm adding more native code. In addition to the
improved benchmark, I'll commit a benchmark that contains the pure Java
implementations so we can see when the JIT advances to the point that we can
retire this native code.

Change-Id: Ibac24c2e3deed216bd492acf2fac7554d3f96d85
diff --git a/luni/src/main/java/java/lang/String.java b/luni/src/main/java/java/lang/String.java
index df8da94..e9774e5 100644
--- a/luni/src/main/java/java/lang/String.java
+++ b/luni/src/main/java/java/lang/String.java
@@ -23,6 +23,7 @@
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
+import java.nio.charset.Charsets;
 import java.util.Comparator;
 import java.util.Formatter;
 import java.util.Locale;
@@ -236,21 +237,7 @@
      *             if the named charset is not supported.
      */
     public String(byte[] data, int start, int length, String charsetName) throws UnsupportedEncodingException {
-        this(data, start, length, charsetForName(charsetName));
-    }
-
-    /**
-     * Calls Charset.forName but only throws UnsupportedEncodingException, which is all String
-     * claims to throw.
-     */
-    private static Charset charsetForName(String charsetName) throws UnsupportedEncodingException {
-        try {
-            return Charset.forName(charsetName);
-        } catch (Exception cause) {
-            UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
-            ex.initCause(cause);
-            throw ex;
-        }
+        this(data, start, length, Charset.forNameUEE(charsetName));
     }
 
     /**
@@ -269,7 +256,7 @@
      *             if {@code charsetName} is not supported.
      */
     public String(byte[] data, String charsetName) throws UnsupportedEncodingException {
-        this(data, 0, data.length, charsetForName(charsetName));
+        this(data, 0, data.length, Charset.forNameUEE(charsetName));
     }
 
     /**
@@ -423,17 +410,12 @@
             this.offset = 0;
             this.value = new char[length];
             this.count = length;
-            for (int i = 0; i < count; ++i) {
-                value[i] = (char) (data[start++] & 0xff);
-            }
+            Charsets.isoLatin1BytesToChars(data, start, length, value);
         } else if (canonicalCharsetName.equals("US-ASCII")) {
             this.offset = 0;
             this.value = new char[length];
             this.count = length;
-            for (int i = 0; i < count; ++i) {
-                char ch = (char) (data[start++] & 0xff);
-                value[i] = (ch <= 0x7f) ? ch : REPLACEMENT_CHAR;
-            }
+            Charsets.asciiBytesToChars(data, start, length, value);
         } else {
             CharBuffer cb = charset.decode(ByteBuffer.wrap(data, start, length));
             this.offset = 0;
@@ -970,7 +952,7 @@
      * @throws UnsupportedEncodingException if the charset is not supported
      */
     public byte[] getBytes(String charsetName) throws UnsupportedEncodingException {
-        return getBytes(charsetForName(charsetName));
+        return getBytes(Charset.forNameUEE(charsetName));
     }
 
     /**
@@ -986,11 +968,11 @@
     public byte[] getBytes(Charset charset) {
         String canonicalCharsetName = charset.name();
         if (canonicalCharsetName.equals("UTF-8")) {
-            return getUtf8Bytes();
+            return Charsets.toUtf8Bytes(value, offset, count);
         } else if (canonicalCharsetName.equals("ISO-8859-1")) {
-            return getDirectMappedBytes(0xff);
+            return Charsets.toIsoLatin1Bytes(value, offset, count);
         } else if (canonicalCharsetName.equals("US-ASCII")) {
-            return getDirectMappedBytes(0x7f);
+            return Charsets.toAsciiBytes(value, offset, count);
         } else {
             CharBuffer chars = CharBuffer.wrap(this.value, this.offset, this.count);
             ByteBuffer buffer = charset.encode(chars.asReadOnlyBuffer());
@@ -1001,59 +983,6 @@
     }
 
     /**
-     * Translates this string's characters to US-ASCII or ISO-8859-1 bytes, using the fact that
-     * Unicode code points between U+0000 and U+007f inclusive are identical to US-ASCII, while
-     * U+0000 to U+00ff inclusive are identical to ISO-8859-1.
-     */
-    private byte[] getDirectMappedBytes(int maxValidChar) {
-        byte[] result = new byte[count];
-        int o = offset;
-        for (int i = 0; i < count; ++i) {
-            int ch = value[o++];
-            result[i] = (byte) ((ch <= maxValidChar) ? ch : '?');
-        }
-        return result;
-    }
-
-    private byte[] getUtf8Bytes() {
-        UnsafeByteSequence result = new UnsafeByteSequence(count);
-        final int end = offset + count;
-        for (int i = offset; i < end; ++i) {
-            int ch = value[i];
-            if (ch < 0x80) {
-                // One byte.
-                result.write(ch);
-            } else if (ch < 0x800) {
-                // Two bytes.
-                result.write((ch >> 6) | 0xc0);
-                result.write((ch & 0x3f) | 0x80);
-            } else if (ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE) {
-                // A supplementary character.
-                char high = (char) ch;
-                char low = (i + 1 != end) ? value[i + 1] : '\u0000';
-                if (!Character.isSurrogatePair(high, low)) {
-                    result.write('?');
-                    continue;
-                }
-                // Now we know we have a *valid* surrogate pair, we can consume the low surrogate.
-                ++i;
-                ch = Character.toCodePoint(high, low);
-                // Four bytes.
-                result.write((ch >> 18) | 0xf0);
-                result.write(((ch >> 12) & 0x3f) | 0x80);
-                result.write(((ch >> 6) & 0x3f) | 0x80);
-                result.write((ch & 0x3f) | 0x80);
-            } else {
-                // Three bytes.
-                result.write((ch >> 12) | 0xe0);
-                result.write(((ch >> 6) & 0x3f) | 0x80);
-                result.write((ch & 0x3f) | 0x80);
-            }
-        }
-        return result.toByteArray();
-    }
-
-    /**
      * Copies the specified characters in this string to the character array
      * starting at the specified offset in the character array.
      *
diff --git a/luni/src/main/java/java/nio/charset/Charset.java b/luni/src/main/java/java/nio/charset/Charset.java
index 64d736e..14a992f 100644
--- a/luni/src/main/java/java/nio/charset/Charset.java
+++ b/luni/src/main/java/java/nio/charset/Charset.java
@@ -22,6 +22,7 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
@@ -312,6 +313,22 @@
     }
 
     /**
+     * Equivalent to {@code forName} but only throws {@code UnsupportedEncodingException},
+     * which is all pre-nio code claims to throw. Whatever {@code forName} throws is attached as
+     * the cause of an {@code UnsupportedEncodingException} whose message is {@code charsetName}.
+     * @hide
+     */
+    public static Charset forNameUEE(String charsetName) throws UnsupportedEncodingException {
+        try {
+            return Charset.forName(charsetName);
+        } catch (Exception cause) { // Any failure at all, not only "unsupported charset".
+            UnsupportedEncodingException ex = new UnsupportedEncodingException(charsetName);
+            ex.initCause(cause);
+            throw ex;
+        }
+    }
+
+    /**
      * Determines whether the specified charset is supported by this runtime.
      *
      * @param charsetName
diff --git a/luni/src/main/java/java/nio/charset/Charsets.java b/luni/src/main/java/java/nio/charset/Charsets.java
index 089da06..1e11bef 100644
--- a/luni/src/main/java/java/nio/charset/Charsets.java
+++ b/luni/src/main/java/java/nio/charset/Charsets.java
@@ -21,15 +21,63 @@
  * unnecessary handling of UnsupportedEncodingException at call sites, compared to using the
  * charset's name.
  *
+ * Also various special-case charset conversions (for performance).
+ *
  * @hide internal use only
  */
 public class Charsets {
+    /**
+     * A cheap and type-safe constant for the ISO-8859-1 Charset.
+     */
     public static final Charset ISO_8859_1 = Charset.forName("ISO-8859-1");
 
+    /**
+     * A cheap and type-safe constant for the US-ASCII Charset.
+     */
     public static final Charset US_ASCII = Charset.forName("US-ASCII");
 
+    /**
+     * A cheap and type-safe constant for the UTF-8 Charset.
+     */
     public static final Charset UTF_8 = Charset.forName("UTF-8");
 
+    /**
+     * Returns a new byte array containing the bytes corresponding to the given characters,
+     * encoded in US-ASCII. Unrepresentable characters are replaced by (byte) '?'.
+     */
+    public static native byte[] toAsciiBytes(char[] chars, int offset, int length);
+
+    /**
+     * Returns a new byte array containing the bytes corresponding to the given characters,
+     * encoded in ISO-8859-1. Unrepresentable characters are replaced by (byte) '?'.
+     */
+    public static native byte[] toIsoLatin1Bytes(char[] chars, int offset, int length);
+
+    /**
+     * Returns a new byte array containing the bytes corresponding to the given characters,
+     * encoded in UTF-8. Every code point is representable; unpaired surrogates become (byte) '?'.
+     */
+    public static native byte[] toUtf8Bytes(char[] chars, int offset, int length);
+
+    /**
+     * Decodes the given US-ASCII bytes into the given char[]. Equivalent to but faster than:
+     *
+     * for (int i = 0; i < length; ++i) {
+     *     char ch = (char) (bytes[offset++] & 0xff);
+     *     chars[i] = (ch <= 0x7f) ? ch : REPLACEMENT_CHAR; // REPLACEMENT_CHAR is U+FFFD.
+     * }
+     */
+    public static native void asciiBytesToChars(byte[] bytes, int offset, int length, char[] chars);
+
+    /**
+     * Decodes the given ISO-8859-1 bytes into the given char[]. Equivalent to but faster than:
+     *
+     * for (int i = 0; i < length; ++i) {
+     *     chars[i] = (char) (bytes[offset++] & 0xff);
+     * }
+     */
+    public static native void isoLatin1BytesToChars(byte[] bytes, int offset, int length, char[] chars);
+
     private Charsets() {
     }
 }
diff --git a/luni/src/main/native/Register.cpp b/luni/src/main/native/Register.cpp
index 01cf1c2..71a5041 100644
--- a/luni/src/main/native/Register.cpp
+++ b/luni/src/main/native/Register.cpp
@@ -42,6 +42,7 @@
 extern int register_java_math_NativeBN(JNIEnv* env);
 extern int register_java_net_InetAddress(JNIEnv* env);
 extern int register_java_net_NetworkInterface(JNIEnv* env);
+extern int register_java_nio_charset_Charsets(JNIEnv* env);
 extern int register_java_util_regex_Matcher(JNIEnv* env);
 extern int register_java_util_regex_Pattern(JNIEnv* env);
 extern int register_java_util_zip_Adler32(JNIEnv* env);
@@ -90,6 +91,7 @@
             register_java_math_NativeBN(env) != -1 &&
             register_java_net_InetAddress(env) != -1 &&
             register_java_net_NetworkInterface(env) != -1 &&
+            register_java_nio_charset_Charsets(env) != -1 &&
             register_java_util_regex_Matcher(env) != -1 &&
             register_java_util_regex_Pattern(env) != -1 &&
             register_java_util_zip_Adler32(env) != -1 &&
diff --git a/luni/src/main/native/java_nio_charset_Charsets.cpp b/luni/src/main/native/java_nio_charset_Charsets.cpp
new file mode 100644
index 0000000..325c49a
--- /dev/null
+++ b/luni/src/main/native/java_nio_charset_Charsets.cpp
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "String"
+
+#include "JNIHelp.h"
+#include "ScopedPrimitiveArray.h"
+#include "jni.h"
+#include "unicode/utf16.h"
+#include <string.h>
+
+/**
+ * Approximates java.lang.UnsafeByteSequence so we don't have to pay the cost of calling back into
+ * Java when converting a char[] to a UTF-8 byte[]. This lets us have UTF-8 conversions slightly
+ * faster than ICU for large char[]s without paying for the NIO overhead with small char[]s.
+ *
+ * We could avoid this by keeping the UTF-8 bytes on the native heap until we're done and only
+ * creating a byte[] on the Java heap when we know how big it needs to be, but one shouldn't lie
+ * to the garbage collector (nor hide potentially large allocations from it).
+ *
+ * Because a call to append might require an allocation, it might fail. Callers should always
+ * check the return value of append. Likewise resize and toByteArray report failure (false and
+ * NULL respectively) when a Java array could not be allocated or accessed.
+ */
+class NativeUnsafeByteSequence {
+public:
+    NativeUnsafeByteSequence(JNIEnv* env)
+        : mEnv(env), mJavaArray(NULL), mRawArray(NULL), mSize(0), mOffset(0)
+    {
+    }
+
+    ~NativeUnsafeByteSequence() {
+        // Release our pointer to the raw array, copying changes back to the Java heap. There is
+        // nothing to release if no array was ever successfully allocated.
+        if (mRawArray != NULL) {
+            mEnv->ReleaseByteArrayElements(mJavaArray, mRawArray, 0);
+        }
+    }
+
+    /**
+     * Appends one byte, growing the backing array first if it is full.
+     * Returns false if the array needed to grow and that failed.
+     */
+    bool append(jbyte b) {
+        // Double the capacity, but make sure an empty array still grows.
+        if (mOffset == mSize && !resize((mSize > 0) ? mSize * 2 : 1)) {
+            return false;
+        }
+        mRawArray[mOffset++] = b;
+        return true;
+    }
+
+    /**
+     * Replaces the backing array with a new byte[] of exactly 'size' elements, preserving the
+     * bytes appended so far (or as many of them as fit). Returns false on failure, in which case
+     * the sequence is left unchanged.
+     */
+    bool resize(int size) {
+        // Only skip the allocation if we already hold an array of the requested size. A fresh
+        // sequence has mSize == 0 but no array, and resize(0) must still create an empty byte[]
+        // so that toByteArray can return it for empty input.
+        if (mJavaArray != NULL && size == mSize) {
+            return true;
+        }
+
+        // Allocate a new array.
+        jbyteArray newJavaArray = mEnv->NewByteArray(size);
+        if (newJavaArray == NULL) {
+            return false;
+        }
+        jbyte* newRawArray = mEnv->GetByteArrayElements(newJavaArray, NULL);
+        if (newRawArray == NULL) {
+            mEnv->DeleteLocalRef(newJavaArray);
+            return false;
+        }
+
+        if (mRawArray != NULL) {
+            // Copy data out of the old array and then let go of it. Only the first mOffset bytes
+            // have been written, and when shrinking the new array can't hold more than 'size'.
+            if (mOffset > size) {
+                mOffset = size;
+            }
+            memcpy(newRawArray, mRawArray, mOffset);
+            mEnv->ReleaseByteArrayElements(mJavaArray, mRawArray, JNI_ABORT);
+            mEnv->DeleteLocalRef(mJavaArray);
+        }
+
+        // Point ourselves at the new array.
+        mJavaArray = newJavaArray;
+        mRawArray = newRawArray;
+        mSize = size;
+        return true;
+    }
+
+    /**
+     * Returns a byte[] containing exactly the bytes appended so far, or NULL on failure.
+     * The bytes are only guaranteed to be visible in the returned array once this object has
+     * been destroyed, because that's when the raw array is released.
+     */
+    jbyteArray toByteArray() {
+        // Trim any unused space, if necessary.
+        bool okay = resize(mOffset);
+        return okay ? mJavaArray : NULL;
+    }
+
+private:
+    JNIEnv* mEnv;
+    jbyteArray mJavaArray; // The current backing byte[] (NULL until the first allocation).
+    jbyte* mRawArray; // The elements of mJavaArray, as returned by GetByteArrayElements.
+    jint mSize; // The length of mJavaArray.
+    jint mOffset; // The number of bytes appended so far; also the next index to write.
+
+    // Disallow copy and assignment.
+    NativeUnsafeByteSequence(const NativeUnsafeByteSequence&);
+    void operator=(const NativeUnsafeByteSequence&);
+};
+
+// Decodes 'length' US-ASCII bytes starting at javaBytes[offset] into javaChars[0..length).
+// Bytes above 0x7f are not valid US-ASCII and decode to U+FFFD (the replacement character).
+static void Charsets_asciiBytesToChars(JNIEnv* env, jclass, jbyteArray javaBytes, jint offset, jint length, jcharArray javaChars) {
+    ScopedByteArrayRO bytes(env, javaBytes);
+    if (bytes.get() == NULL) {
+        return;
+    }
+    ScopedCharArrayRW chars(env, javaChars);
+    if (chars.get() == NULL) {
+        return;
+    }
+
+    static const jchar REPLACEMENT_CHAR = 0xfffd;
+    for (int i = 0; i < length; ++i) {
+        const jchar ch = static_cast<jchar>(bytes[offset + i] & 0xff);
+        chars[i] = (ch <= 0x7f) ? ch : REPLACEMENT_CHAR;
+    }
+}
+
+// Decodes 'length' ISO-8859-1 bytes starting at javaBytes[offset] into javaChars[0..length).
+// Each byte is masked with 0xff so that it maps to the char with the same unsigned value.
+static void Charsets_isoLatin1BytesToChars(JNIEnv* env, jclass, jbyteArray javaBytes, jint offset, jint length, jcharArray javaChars) {
+    ScopedByteArrayRO bytes(env, javaBytes);
+    if (bytes.get() == NULL) {
+        return;
+    }
+    ScopedCharArrayRW chars(env, javaChars);
+    if (chars.get() == NULL) {
+        return;
+    }
+
+    for (int i = 0; i < length; ++i) {
+        chars[i] = static_cast<jchar>(bytes[offset + i] & 0xff);
+    }
+}
+
+/**
+ * Translates the given characters to US-ASCII or ISO-8859-1 bytes, using the fact that
+ * Unicode code points between U+0000 and U+007f inclusive are identical to US-ASCII, while
+ * U+0000 to U+00ff inclusive are identical to ISO-8859-1. Characters above maxValidChar are
+ * replaced by '?'. Returns NULL if either array could not be allocated or accessed.
+ */
+static jbyteArray charsToBytes(JNIEnv* env, jcharArray javaChars, jint offset, jint length, jchar maxValidChar) {
+    ScopedCharArrayRO chars(env, javaChars);
+    if (chars.get() == NULL) {
+        return NULL;
+    }
+
+    // Bail out before wrapping the new array: if the allocation failed there is nothing to wrap.
+    jbyteArray javaBytes = env->NewByteArray(length);
+    if (javaBytes == NULL) {
+        return NULL;
+    }
+    ScopedByteArrayRW bytes(env, javaBytes);
+    if (bytes.get() == NULL) {
+        return NULL;
+    }
+
+    for (int i = 0; i < length; ++i) {
+        const jchar ch = chars[offset + i];
+        bytes[i] = static_cast<jbyte>((ch > maxValidChar) ? '?' : ch);
+    }
+
+    return javaBytes;
+}
+
+static jbyteArray Charsets_toAsciiBytes(JNIEnv* env, jclass, jcharArray javaChars, jint offset, jint length) {
+    return charsToBytes(env, javaChars, offset, length, 0x7f); // U+007f: highest US-ASCII char.
+}
+
+static jbyteArray Charsets_toIsoLatin1Bytes(JNIEnv* env, jclass, jcharArray javaChars, jint offset, jint length) {
+    return charsToBytes(env, javaChars, offset, length, 0xff); // U+00ff: highest ISO-8859-1 char.
+}
+
+static jbyteArray Charsets_toUtf8Bytes(JNIEnv* env, jclass, jcharArray javaChars, jint offset, jint length) {
+    ScopedCharArrayRO chars(env, javaChars);
+    if (chars.get() == NULL) {
+        return NULL;
+    }
+
+    NativeUnsafeByteSequence out(env);
+    if (!out.resize(length)) { // Initial capacity: one byte per char; append grows it as needed.
+        return NULL;
+    }
+
+    const int end = offset + length;
+    for (int i = offset; i < end; ++i) {
+        jint ch = chars[i];
+        if (ch < 0x80) {
+            // One byte: 0xxxxxxx.
+            if (!out.append(ch)) {
+                return NULL;
+            }
+        } else if (ch < 0x800) {
+            // Two bytes: 110xxxxx 10xxxxxx.
+            if (!out.append((ch >> 6) | 0xc0) || !out.append((ch & 0x3f) | 0x80)) {
+                return NULL;
+            }
+        } else if (U16_IS_SURROGATE(ch)) {
+            // A surrogate: only a lead (high) surrogate followed by a trail (low) one is valid.
+            jchar high = (jchar) ch;
+            jchar low = (i + 1 != end) ? chars[i + 1] : 0; // 0 stands in for "no next char".
+            if (!U16_IS_SURROGATE_LEAD(high) || !U16_IS_SURROGATE_TRAIL(low)) {
+                if (!out.append('?')) { // Unpaired surrogate: emit '?' in its place.
+                    return NULL;
+                }
+                continue;
+            }
+            // Now we know we have a *valid* surrogate pair, we can consume the low surrogate.
+            ++i;
+            ch = U16_GET_SUPPLEMENTARY(high, low);
+            // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+            jbyte b1 = (ch >> 18) | 0xf0;
+            jbyte b2 = ((ch >> 12) & 0x3f) | 0x80;
+            jbyte b3 = ((ch >> 6) & 0x3f) | 0x80;
+            jbyte b4 = (ch & 0x3f) | 0x80;
+            if (!out.append(b1) || !out.append(b2) || !out.append(b3) || !out.append(b4)) {
+                return NULL;
+            }
+        } else {
+            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+            jbyte b1 = (ch >> 12) | 0xe0;
+            jbyte b2 = ((ch >> 6) & 0x3f) | 0x80;
+            jbyte b3 = (ch & 0x3f) | 0x80;
+            if (!out.append(b1) || !out.append(b2) || !out.append(b3)) {
+                return NULL;
+            }
+        }
+    }
+    return out.toByteArray();
+}
+
+static JNINativeMethod gMethods[] = { // Java method name, JNI type signature, implementation.
+    { "asciiBytesToChars", "([BII[C)V", (void*) Charsets_asciiBytesToChars },
+    { "isoLatin1BytesToChars", "([BII[C)V", (void*) Charsets_isoLatin1BytesToChars },
+    { "toAsciiBytes", "([CII)[B", (void*) Charsets_toAsciiBytes },
+    { "toIsoLatin1Bytes", "([CII)[B", (void*) Charsets_toIsoLatin1Bytes },
+    { "toUtf8Bytes", "([CII)[B", (void*) Charsets_toUtf8Bytes },
+};
+int register_java_nio_charset_Charsets(JNIEnv* env) { // Called from Register.cpp (-1 = failure).
+    return jniRegisterNativeMethods(env, "java/nio/charset/Charsets", gMethods, NELEM(gMethods));
+}
diff --git a/luni/src/main/native/sub.mk b/luni/src/main/native/sub.mk
index 73ceedb..6bdbc3b 100644
--- a/luni/src/main/native/sub.mk
+++ b/luni/src/main/native/sub.mk
@@ -36,6 +36,7 @@
 	java_lang_System.cpp \
 	java_net_InetAddress.cpp \
 	java_net_NetworkInterface.cpp \
+	java_nio_charset_Charsets.cpp \
 	java_util_regex_Matcher.cpp \
 	java_util_regex_Pattern.cpp \
 	java_util_zip_Adler32.cpp \