Optional use of UTF-8 strings in resource bundles

Allows the use of UTF-8 for packing resources instead of the
default of UTF-16 for Java. When strings are extracted from the
ResStringPool, they are converted to UTF-16 and the result is
cached for subsequent calls.

When using aapt to package, add in the "-8" switch to pack the
resources using UTF-8. This will result in the value, key, and
type strings as well as the compiled XML string values taking
significantly less space in the final application package in
most scenarios.

Change-Id: I129483f8b3d3b1c5869dced05cb525e494a6c83a
diff --git a/include/utils/ResourceTypes.h b/include/utils/ResourceTypes.h
index 49145e8..a845908 100644
--- a/include/utils/ResourceTypes.h
+++ b/include/utils/ResourceTypes.h
@@ -393,7 +393,10 @@
     enum {
         // If set, the string index is sorted by the string values (based
         // on strcmp16()).
-        SORTED_FLAG = 1<<0
+        SORTED_FLAG = 1<<0,
+
+        // String pool is encoded in UTF-8
+        UTF8_FLAG = 1<<8
     };
     uint32_t flags;
 
@@ -456,9 +459,11 @@
     void*                       mOwnedData;
     const ResStringPool_header* mHeader;
     size_t                      mSize;
+    mutable Mutex               mDecodeLock;
     const uint32_t*             mEntries;
     const uint32_t*             mEntryStyles;
-    const char16_t*             mStrings;
+    const void*                 mStrings;
+    char16_t**                  mCache;
     uint32_t                    mStringPoolSize;    // number of uint16_t
     const uint32_t*             mStyles;
     uint32_t                    mStylePoolSize;    // number of uint32_t
diff --git a/include/utils/String16.h b/include/utils/String16.h
index a2d22ee..07a0c11 100644
--- a/include/utils/String16.h
+++ b/include/utils/String16.h
@@ -49,12 +49,17 @@
 // Version of strzcmp16 for comparing strings in different endianness.
 int strzcmp16_h_n(const char16_t *s1H, size_t n1, const char16_t *s2N, size_t n2);
 
+// Convert UTF-8 to UTF-16 including surrogate pairs
+void utf8_to_utf16(const uint8_t *src, size_t srcLen, char16_t* dst, const size_t dstLen);
+
 }
 
 // ---------------------------------------------------------------------------
 
 namespace android {
 
+// ---------------------------------------------------------------------------
+
 class String8;
 class TextOutput;
 
diff --git a/include/utils/String8.h b/include/utils/String8.h
index ecc5774..c4b18a4 100644
--- a/include/utils/String8.h
+++ b/include/utils/String8.h
@@ -60,6 +60,11 @@
 /*
  * Returns the UTF-8 length of "src".
  */
+size_t utf8_length_from_utf16(const char16_t *src, size_t src_len);
+
+/*
+ * Returns the UTF-8 length of "src".
+ */
 size_t utf8_length_from_utf32(const char32_t *src, size_t src_len);
 
 /*
@@ -120,6 +125,9 @@
 size_t utf32_to_utf8(const char32_t* src, size_t src_len,
                      char* dst, size_t dst_len);
 
+size_t utf16_to_utf8(const char16_t* src, size_t src_len,
+                     char* dst, size_t dst_len);
+
 }
 
 // ---------------------------------------------------------------------------
diff --git a/libs/utils/ResourceTypes.cpp b/libs/utils/ResourceTypes.cpp
index 450af8d..afca814 100644
--- a/libs/utils/ResourceTypes.cpp
+++ b/libs/utils/ResourceTypes.cpp
@@ -229,12 +229,12 @@
 // --------------------------------------------------------------------
 
 ResStringPool::ResStringPool()
-    : mError(NO_INIT), mOwnedData(NULL)
+    : mError(NO_INIT), mOwnedData(NULL), mHeader(NULL), mCache(NULL)
 {
 }
 
 ResStringPool::ResStringPool(const void* data, size_t size, bool copyData)
-    : mError(NO_INIT), mOwnedData(NULL)
+    : mError(NO_INIT), mOwnedData(NULL), mHeader(NULL), mCache(NULL)
 {
     setTo(data, size, copyData);
 }
@@ -296,7 +296,17 @@
                     (int)size);
             return (mError=BAD_TYPE);
         }
-        mStrings = (const char16_t*)
+
+        size_t charSize;
+        if (mHeader->flags&ResStringPool_header::UTF8_FLAG) {
+            charSize = sizeof(uint8_t);
+            mCache = (char16_t**)malloc(sizeof(char16_t**)*mHeader->stringCount);
+            memset(mCache, 0, sizeof(char16_t**)*mHeader->stringCount);
+        } else {
+            charSize = sizeof(char16_t);
+        }
+
+        mStrings = (const void*)
             (((const uint8_t*)data)+mHeader->stringsStart);
         if (mHeader->stringsStart >= (mHeader->header.size-sizeof(uint16_t))) {
             LOGW("Bad string block: string pool starts at %d, after total size %d\n",
@@ -305,7 +315,7 @@
         }
         if (mHeader->styleCount == 0) {
             mStringPoolSize =
-                (mHeader->header.size-mHeader->stringsStart)/sizeof(uint16_t);
+                (mHeader->header.size-mHeader->stringsStart)/charSize;
         } else {
             // check invariant: styles follow the strings
             if (mHeader->stylesStart <= mHeader->stringsStart) {
@@ -314,7 +324,7 @@
                 return (mError=BAD_TYPE);
             }
             mStringPoolSize =
-                (mHeader->stylesStart-mHeader->stringsStart)/sizeof(uint16_t);
+                (mHeader->stylesStart-mHeader->stringsStart)/charSize;
         }
 
         // check invariant: stringCount > 0 requires a string pool to exist
@@ -329,13 +339,19 @@
             for (i=0; i<mHeader->stringCount; i++) {
                 e[i] = dtohl(mEntries[i]);
             }
-            char16_t* s = const_cast<char16_t*>(mStrings);
-            for (i=0; i<mStringPoolSize; i++) {
-                s[i] = dtohs(mStrings[i]);
+            if (!(mHeader->flags&ResStringPool_header::UTF8_FLAG)) {
+                const char16_t* strings = (const char16_t*)mStrings;
+                char16_t* s = const_cast<char16_t*>(strings);
+                for (i=0; i<mStringPoolSize; i++) {
+                    s[i] = dtohs(strings[i]);
+                }
             }
         }
 
-        if (mStrings[mStringPoolSize-1] != 0) {
+        if ((mHeader->flags&ResStringPool_header::UTF8_FLAG &&
+                ((uint8_t*)mStrings)[mStringPoolSize-1] != 0) ||
+                (!mHeader->flags&ResStringPool_header::UTF8_FLAG &&
+                ((char16_t*)mStrings)[mStringPoolSize-1] != 0)) {
             LOGW("Bad string block: last string is not 0-terminated\n");
             return (mError=BAD_TYPE);
         }
@@ -410,24 +426,67 @@
         free(mOwnedData);
         mOwnedData = NULL;
     }
+    if (mHeader != NULL && mCache != NULL) {
+        for (size_t x = 0; x < mHeader->stringCount; x++) {
+            if (mCache[x] != NULL) {
+                free(mCache[x]);
+                mCache[x] = NULL;
+            }
+        }
+        free(mCache);
+        mCache = NULL;
+    }
 }
 
+#define DECODE_LENGTH(str, chrsz, len) \
+    len = *(str); \
+    if (*(str)&(1<<(chrsz*8-1))) { \
+        (str)++; \
+        len = (((len)&((1<<(chrsz*8-1))-1))<<(chrsz*8)) + *(str); \
+    } \
+    (str)++;
+
 const uint16_t* ResStringPool::stringAt(size_t idx, size_t* outLen) const
 {
     if (mError == NO_ERROR && idx < mHeader->stringCount) {
-        const uint32_t off = (mEntries[idx]/sizeof(uint16_t));
+        const bool isUTF8 = (mHeader->flags&ResStringPool_header::UTF8_FLAG) != 0;
+        const uint32_t off = mEntries[idx]/(isUTF8?sizeof(char):sizeof(char16_t));
         if (off < (mStringPoolSize-1)) {
-            const char16_t* str = mStrings+off;
-            *outLen = *str;
-            if ((*str)&0x8000) {
-                str++;
-                *outLen = (((*outLen)&0x7fff)<<16) + *str;
-            }
-            if ((uint32_t)(str+1+*outLen-mStrings) < mStringPoolSize) {
-                return str+1;
+            if (!isUTF8) {
+                const char16_t* strings = (char16_t*)mStrings;
+                const char16_t* str = strings+off;
+                DECODE_LENGTH(str, sizeof(char16_t), *outLen)
+                if ((uint32_t)(str+*outLen-strings) < mStringPoolSize) {
+                    return str;
+                } else {
+                    LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
+                            (int)idx, (int)(str+*outLen-strings), (int)mStringPoolSize);
+                }
             } else {
-                LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
-                        (int)idx, (int)(str+1+*outLen-mStrings), (int)mStringPoolSize);
+                const uint8_t* strings = (uint8_t*)mStrings;
+                const uint8_t* str = strings+off;
+                DECODE_LENGTH(str, sizeof(uint8_t), *outLen)
+                size_t encLen;
+                DECODE_LENGTH(str, sizeof(uint8_t), encLen)
+                if ((uint32_t)(str+encLen-strings) < mStringPoolSize) {
+                    AutoMutex lock(mDecodeLock);
+                    if (mCache[idx] != NULL) {
+                        return mCache[idx];
+                    }
+                    char16_t *u16str = (char16_t *)calloc(*outLen+1, sizeof(char16_t));
+                    if (!u16str) {
+                        LOGW("No memory when trying to allocate decode cache for string #%d\n",
+                                (int)idx);
+                        return NULL;
+                    }
+                    const unsigned char *u8src = reinterpret_cast<const unsigned char *>(str);
+                    utf8_to_utf16(u8src, encLen, u16str, *outLen);
+                    mCache[idx] = u16str;
+                    return u16str;
+                } else {
+                    LOGW("Bad string block: string #%d extends to %d, past end at %d\n",
+                            (int)idx, (int)(str+encLen-strings), (int)mStringPoolSize);
+                }
             }
         } else {
             LOGW("Bad string block: string #%d entry is at %d, past end at %d\n",
@@ -466,6 +525,10 @@
 
     size_t len;
 
+    // TODO optimize searching for UTF-8 strings taking into account
+    // the cache fill to determine when to convert the searched-for
+    // string key to UTF-8.
+
     if (mHeader->flags&ResStringPool_header::SORTED_FLAG) {
         // Do a binary search for the string...
         ssize_t l = 0;
@@ -1043,6 +1106,7 @@
 void ResXMLTree::uninit()
 {
     mError = NO_INIT;
+    mStrings.uninit();
     if (mOwnedData) {
         free(mOwnedData);
         mOwnedData = NULL;
diff --git a/libs/utils/String16.cpp b/libs/utils/String16.cpp
index aef67f2..eab7b2b 100644
--- a/libs/utils/String16.cpp
+++ b/libs/utils/String16.cpp
@@ -172,10 +172,6 @@
            : 0);
 }
 
-// ---------------------------------------------------------------------------
-
-namespace android {
-
 static inline size_t
 utf8_char_len(uint8_t ch)
 {
@@ -215,8 +211,38 @@
     //printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
 }
 
+void
+utf8_to_utf16(const uint8_t *src, size_t srcLen,
+        char16_t* dst, const size_t dstLen)
+{
+    const uint8_t* const end = src + srcLen;
+    const char16_t* const dstEnd = dst + dstLen;
+    while (src < end && dst < dstEnd) {
+        size_t len = utf8_char_len(*src);
+        uint32_t codepoint = utf8_to_utf32((const uint8_t*)src, len);
+
+        // Convert the UTF32 codepoint to one or more UTF16 codepoints
+        if (codepoint <= 0xFFFF) {
+            // Single UTF16 character
+            *dst++ = (char16_t) codepoint;
+        } else {
+            // Multiple UTF16 characters with surrogates
+            codepoint = codepoint - 0x10000;
+            *dst++ = (char16_t) ((codepoint >> 10) + 0xD800);
+            *dst++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
+        }
+
+        src += len;
+    }
+    if (dst < dstEnd) {
+        *dst = 0;
+    }
+}
+
 // ---------------------------------------------------------------------------
 
+namespace android {
+
 static SharedBuffer* gEmptyStringBuf = NULL;
 static char16_t* gEmptyString = NULL;
 
@@ -260,30 +286,14 @@
         p += utf8len;
     }
     
-    SharedBuffer* buf = SharedBuffer::alloc((chars+1)*sizeof(char16_t));
+    size_t bufSize = (chars+1)*sizeof(char16_t);
+    SharedBuffer* buf = SharedBuffer::alloc(bufSize);
     if (buf) {
         p = in;
         char16_t* str = (char16_t*)buf->data();
-        char16_t* d = str;
-        while (p < end) {
-            size_t len = utf8_char_len(*p);
-            uint32_t codepoint = utf8_to_utf32((const uint8_t*)p, len);
-
-            // Convert the UTF32 codepoint to one or more UTF16 codepoints
-            if (codepoint <= 0xFFFF) {
-                // Single UTF16 character
-                *d++ = (char16_t) codepoint;
-            } else {
-                // Multiple UTF16 characters with surrogates
-                codepoint = codepoint - 0x10000;
-                *d++ = (char16_t) ((codepoint >> 10) + 0xD800);
-                *d++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
-            }
-
-            p += len;
-        }
-        *d = 0;
         
+        utf8_to_utf16((const uint8_t*)p, len, str, bufSize);
+
         //printf("Created UTF-16 string from UTF-8 \"%s\":", in);
         //printHexData(1, str, buf->size(), 16, 1);
         //printf("\n");
diff --git a/libs/utils/String8.cpp b/libs/utils/String8.cpp
index e908ec1..3a34838 100644
--- a/libs/utils/String8.cpp
+++ b/libs/utils/String8.cpp
@@ -208,10 +208,23 @@
     return getEmptyString();
 }
 
-// Note: not dealing with expanding surrogate pairs.
 static char* allocFromUTF16(const char16_t* in, size_t len)
 {
-    return allocFromUTF16OrUTF32<char16_t, size_t>(in, len);
+    if (len == 0) return getEmptyString();
+
+    const size_t bytes = utf8_length_from_utf16(in, len);
+
+    SharedBuffer* buf = SharedBuffer::alloc(bytes+1);
+    LOG_ASSERT(buf, "Unable to allocate shared buffer");
+    if (buf) {
+        char* str = (char*)buf->data();
+
+        utf16_to_utf8(in, len, str, bytes+1);
+
+        return str;
+    }
+
+    return getEmptyString();
 }
 
 static char* allocFromUTF32(const char32_t* in, size_t len)
@@ -762,6 +775,26 @@
     return ret;
 }
 
+size_t utf8_length_from_utf16(const char16_t *src, size_t src_len)
+{
+    if (src == NULL || src_len == 0) {
+        return 0;
+    }
+    size_t ret = 0;
+    const char16_t* const end = src + src_len;
+    while (src < end) {
+        if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
+                && (*++src & 0xFC00) == 0xDC00) {
+            // surrogate pairs are always 4 bytes.
+            ret += 4;
+            src++;
+        } else {
+            ret += android::utf32_to_utf8_bytes((char32_t) *src++);
+        }
+    }
+    return ret;
+}
+
 static int32_t utf32_at_internal(const char* cur, size_t *num_read)
 {
     const char first_char = *cur;
@@ -848,3 +881,33 @@
     }
     return cur - dst;
 }
+
+size_t utf16_to_utf8(const char16_t* src, size_t src_len,
+                     char* dst, size_t dst_len)
+{
+    if (src == NULL || src_len == 0 || dst == NULL || dst_len == 0) {
+        return 0;
+    }
+    const char16_t* cur_utf16 = src;
+    const char16_t* const end_utf16 = src + src_len;
+    char *cur = dst;
+    const char* const end = dst + dst_len;
+    while (cur_utf16 < end_utf16 && cur < end) {
+        char32_t utf32;
+        // surrogate pairs
+        if ((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16) {
+            utf32 = (*cur_utf16++ - 0xD800) << 10;
+            utf32 |= *cur_utf16++ - 0xDC00;
+            utf32 += 0x10000;
+        } else {
+            utf32 = (char32_t) *cur_utf16++;
+        }
+        size_t len = android::utf32_to_utf8_bytes(utf32);
+        android::utf32_to_utf8((uint8_t*)cur, utf32, len);
+        cur += len;
+    }
+    if (cur < end) {
+        *cur = '\0';
+    }
+    return cur - dst;
+}
diff --git a/tools/aapt/Bundle.h b/tools/aapt/Bundle.h
index 1ac13f2..cf70121 100644
--- a/tools/aapt/Bundle.h
+++ b/tools/aapt/Bundle.h
@@ -37,7 +37,7 @@
           mForce(false), mGrayscaleTolerance(0), mMakePackageDirs(false),
           mUpdate(false), mExtending(false),
           mRequireLocalization(false), mPseudolocalize(false),
-          mValues(false),
+          mUTF8(false), mValues(false),
           mCompressionMethod(0), mOutputAPKFile(NULL),
           mAssetSourceDir(NULL), mProguardFile(NULL),
           mAndroidManifestFile(NULL), mPublicOutputFile(NULL),
@@ -76,6 +76,8 @@
     void setRequireLocalization(bool val) { mRequireLocalization = val; }
     bool getPseudolocalize(void) const { return mPseudolocalize; }
     void setPseudolocalize(bool val) { mPseudolocalize = val; }
+    bool getUTF8(void) const { return mUTF8; }
+    void setUTF8(bool val) { mUTF8 = val; }
     bool getValues(void) const { return mValues; }
     void setValues(bool val) { mValues = val; }
     int getCompressionMethod(void) const { return mCompressionMethod; }
@@ -161,6 +163,7 @@
     bool        mExtending;
     bool        mRequireLocalization;
     bool        mPseudolocalize;
+    bool        mUTF8;
     bool        mValues;
     int         mCompressionMethod;
     bool        mJunkPath;
diff --git a/tools/aapt/Command.cpp b/tools/aapt/Command.cpp
index 1a536d6..ff9cc11 100644
--- a/tools/aapt/Command.cpp
+++ b/tools/aapt/Command.cpp
@@ -412,6 +412,7 @@
             }
             tree.restart();
             printXMLBlock(&tree);
+            tree.uninit();
             delete asset;
             asset = NULL;
         }
diff --git a/tools/aapt/Main.cpp b/tools/aapt/Main.cpp
index 98286c0..bd03b74 100644
--- a/tools/aapt/Main.cpp
+++ b/tools/aapt/Main.cpp
@@ -118,6 +118,7 @@
         "   -P  specify where to output public resource definitions\n"
         "   -S  directory in which to find resources.  Multiple directories will be scanned"
         "       and the first match found (left to right) will take precedence."
+        "   -8  Encode string resources in UTF-8.\n"
         "   -0  specifies an additional extension for which such files will not\n"
         "       be stored compressed in the .apk.  An empty string means to not\n"
         "       compress any files at all.\n"
@@ -370,6 +371,9 @@
                     bundle.setCompressionMethod(ZipEntry::kCompressStored);
                 }
                 break;
+            case '8':
+                bundle.setUTF8(true);
+                break;
             case '-':
                 if (strcmp(cp, "-min-sdk-version") == 0) {
                     argc--;
diff --git a/tools/aapt/Resource.cpp b/tools/aapt/Resource.cpp
index fdcada4..d04a873 100644
--- a/tools/aapt/Resource.cpp
+++ b/tools/aapt/Resource.cpp
@@ -613,6 +613,12 @@
 
     NOISY(printf("Found %d included resource packages\n", (int)table.size()));
 
+    // Standard flags for compiled XML and optional UTF-8 encoding
+    int xmlFlags = XML_COMPILE_STANDARD_RESOURCE;
+    if (bundle->getUTF8()) {
+        xmlFlags |= XML_COMPILE_UTF8;
+    }
+
     // --------------------------------------------------------------
     // First, gather all resource information.
     // --------------------------------------------------------------
@@ -763,7 +769,7 @@
         ResourceDirIterator it(layouts, String8("layout"));
         while ((err=it.next()) == NO_ERROR) {
             String8 src = it.getFile()->getPrintableSource();
-            err = compileXmlFile(assets, it.getFile(), &table);
+            err = compileXmlFile(assets, it.getFile(), &table, xmlFlags);
             if (err == NO_ERROR) {
                 ResXMLTree block;
                 block.setTo(it.getFile()->getData(), it.getFile()->getSize(), true);
@@ -782,7 +788,7 @@
     if (anims != NULL) {
         ResourceDirIterator it(anims, String8("anim"));
         while ((err=it.next()) == NO_ERROR) {
-            err = compileXmlFile(assets, it.getFile(), &table);
+            err = compileXmlFile(assets, it.getFile(), &table, xmlFlags);
             if (err != NO_ERROR) {
                 hasErrors = true;
             }
@@ -797,7 +803,7 @@
     if (xmls != NULL) {
         ResourceDirIterator it(xmls, String8("xml"));
         while ((err=it.next()) == NO_ERROR) {
-            err = compileXmlFile(assets, it.getFile(), &table);
+            err = compileXmlFile(assets, it.getFile(), &table, xmlFlags);
             if (err != NO_ERROR) {
                 hasErrors = true;
             }
@@ -819,7 +825,7 @@
     if (colors != NULL) {
         ResourceDirIterator it(colors, String8("color"));
         while ((err=it.next()) == NO_ERROR) {
-          err = compileXmlFile(assets, it.getFile(), &table);
+          err = compileXmlFile(assets, it.getFile(), &table, xmlFlags);
             if (err != NO_ERROR) {
                 hasErrors = true;
             }
@@ -835,7 +841,7 @@
         ResourceDirIterator it(menus, String8("menu"));
         while ((err=it.next()) == NO_ERROR) {
             String8 src = it.getFile()->getPrintableSource();
-            err = compileXmlFile(assets, it.getFile(), &table);
+            err = compileXmlFile(assets, it.getFile(), &table, xmlFlags);
             if (err != NO_ERROR) {
                 hasErrors = true;
             }
diff --git a/tools/aapt/ResourceTable.cpp b/tools/aapt/ResourceTable.cpp
index 19b9b01..a9cbd11 100644
--- a/tools/aapt/ResourceTable.cpp
+++ b/tools/aapt/ResourceTable.cpp
@@ -39,6 +39,10 @@
         root->removeWhitespace(false, NULL);
     }
 
+    if ((options&XML_COMPILE_UTF8) != 0) {
+        root->setUTF8(true);
+    }
+
     bool hasErrors = false;
     
     if ((options&XML_COMPILE_ASSIGN_ATTRIBUTE_IDS) != 0) {
@@ -2505,7 +2509,7 @@
 
     // Iterate through all data, collecting all values (strings,
     // references, etc).
-    StringPool valueStrings;
+    StringPool valueStrings = StringPool(false, bundle->getUTF8());
     for (pi=0; pi<N; pi++) {
         sp<Package> p = mOrderedPackages.itemAt(pi);
         if (p->getTypes().size() == 0) {
@@ -2513,8 +2517,8 @@
             continue;
         }
 
-        StringPool typeStrings;
-        StringPool keyStrings;
+        StringPool typeStrings = StringPool(false, bundle->getUTF8());
+        StringPool keyStrings = StringPool(false, bundle->getUTF8());
 
         const size_t N = p->getOrderedTypes().size();
         for (size_t ti=0; ti<N; ti++) {
diff --git a/tools/aapt/ResourceTable.h b/tools/aapt/ResourceTable.h
index caa01b3..cfa75a71 100644
--- a/tools/aapt/ResourceTable.h
+++ b/tools/aapt/ResourceTable.h
@@ -24,6 +24,7 @@
     XML_COMPILE_COMPACT_WHITESPACE = 1<<2,
     XML_COMPILE_STRIP_WHITESPACE = 1<<3,
     XML_COMPILE_STRIP_RAW_VALUES = 1<<4,
+    XML_COMPILE_UTF8 = 1<<5,
     
     XML_COMPILE_STANDARD_RESOURCE =
             XML_COMPILE_STRIP_COMMENTS | XML_COMPILE_ASSIGN_ATTRIBUTE_IDS
diff --git a/tools/aapt/StringPool.cpp b/tools/aapt/StringPool.cpp
index 715170a..ec58591 100644
--- a/tools/aapt/StringPool.cpp
+++ b/tools/aapt/StringPool.cpp
@@ -30,8 +30,8 @@
     }
 }
 
-StringPool::StringPool(bool sorted)
-    : mSorted(sorted), mValues(-1), mIdents(-1)
+StringPool::StringPool(bool sorted, bool utf8)
+    : mSorted(sorted), mUTF8(utf8), mValues(-1), mIdents(-1)
 {
 }
 
@@ -165,6 +165,16 @@
     return err == NO_ERROR ? pool : NULL;
 }
 
+#define ENCODE_LENGTH(str, chrsz, strSize) \
+{ \
+    size_t maxMask = 1 << ((chrsz*8)-1); \
+    size_t maxSize = maxMask-1; \
+    if (strSize > maxSize) { \
+        *str++ = maxMask | ((strSize>>(chrsz*8))&maxSize); \
+    } \
+    *str++ = strSize; \
+}
+
 status_t StringPool::writeStringBlock(const sp<AaptFile>& pool)
 {
     // Allow appending.  Sorry this is a little wacky.
@@ -213,28 +223,53 @@
         return NO_MEMORY;
     }
 
+    const size_t charSize = mUTF8 ? sizeof(uint8_t) : sizeof(char16_t);
+
     size_t strPos = 0;
     for (i=0; i<STRINGS; i++) {
         entry& ent = mEntries.editItemAt(i);
         const size_t strSize = (ent.value.size());
-        const size_t lenSize = strSize > 0x7fff ? sizeof(uint32_t) : sizeof(uint16_t);
-        const size_t totalSize = lenSize + ((strSize+1)*sizeof(uint16_t));
+        const size_t lenSize = strSize > (size_t)(1<<((charSize*8)-1))-1 ?
+            charSize*2 : charSize;
+
+        String8 encStr;
+        if (mUTF8) {
+            encStr = String8(ent.value);
+        }
+
+        const size_t encSize = mUTF8 ? encStr.size() : 0;
+        const size_t encLenSize = mUTF8 ?
+            (encSize > (size_t)(1<<((charSize*8)-1))-1 ?
+                charSize*2 : charSize) : 0;
 
         ent.offset = strPos;
-        uint16_t* dat = (uint16_t*)pool->editData(preSize + strPos + totalSize);
+
+        const size_t totalSize = lenSize + encLenSize +
+            ((mUTF8 ? encSize : strSize)+1)*charSize;
+
+        void* dat = (void*)pool->editData(preSize + strPos + totalSize);
         if (dat == NULL) {
             fprintf(stderr, "ERROR: Out of memory for string pool\n");
             return NO_MEMORY;
         }
-        dat += (preSize+strPos)/sizeof(uint16_t);
-        if (lenSize > sizeof(uint16_t)) {
-            *dat = htods(0x8000 | ((strSize>>16)&0x7fff));
-            dat++;
-        }
-        *dat++ = htods(strSize);
-        strcpy16_htod(dat, ent.value);
+        dat = (uint8_t*)dat + preSize + strPos;
+        if (mUTF8) {
+            uint8_t* strings = (uint8_t*)dat;
 
-        strPos += lenSize + (strSize+1)*sizeof(uint16_t);
+            ENCODE_LENGTH(strings, sizeof(uint8_t), strSize)
+
+            ENCODE_LENGTH(strings, sizeof(uint8_t), encSize)
+
+            strncpy((char*)strings, encStr, encSize+1);
+        } else {
+            uint16_t* strings = (uint16_t*)dat;
+
+            ENCODE_LENGTH(strings, sizeof(uint16_t), strSize)
+
+            strcpy16_htod(strings, ent.value);
+        }
+
+        strPos += totalSize;
     }
 
     // Pad ending string position up to a uint32_t boundary.
@@ -312,6 +347,9 @@
     if (mSorted) {
         header->flags |= htodl(ResStringPool_header::SORTED_FLAG);
     }
+    if (mUTF8) {
+        header->flags |= htodl(ResStringPool_header::UTF8_FLAG);
+    }
     header->stringsStart = htodl(preSize);
     header->stylesStart = htodl(STYLES > 0 ? (preSize+strPos) : 0);
 
diff --git a/tools/aapt/StringPool.h b/tools/aapt/StringPool.h
index 9082b37..7275259 100644
--- a/tools/aapt/StringPool.h
+++ b/tools/aapt/StringPool.h
@@ -68,8 +68,11 @@
      * lookup with ResStringPool::indexOfString() (O(log n)), at the expense
      * of support for styled string entries (which requires the same string
      * be included multiple times in the pool).
+     *
+     * If 'utf8' is true, strings will be encoded with UTF-8 instead of
+     * left in Java's native UTF-16.
      */
-    explicit StringPool(bool sorted = false);
+    explicit StringPool(bool sorted = false, bool utf8 = false);
 
     /**
      * Add a new string to the pool.  If mergeDuplicates is true, thenif
@@ -123,6 +126,7 @@
 
 private:
     const bool                              mSorted;
+    const bool                              mUTF8;
     // Raw array of unique strings, in some arbitrary order.
     Vector<entry>                           mEntries;
     // Array of indices into mEntries, in the order they were
diff --git a/tools/aapt/XMLNode.cpp b/tools/aapt/XMLNode.cpp
index d4d2a45c..036dde4 100644
--- a/tools/aapt/XMLNode.cpp
+++ b/tools/aapt/XMLNode.cpp
@@ -478,6 +478,7 @@
     , mFilename(filename)
     , mStartLineNumber(0)
     , mEndLineNumber(0)
+    , mUTF8(false)
 {
     if (isNamespace) {
         mNamespacePrefix = s1;
@@ -837,7 +838,7 @@
 status_t XMLNode::flatten(const sp<AaptFile>& dest,
         bool stripComments, bool stripRawValues) const
 {
-    StringPool strings;
+    StringPool strings = StringPool(false, mUTF8);
     Vector<uint32_t> resids;
     
     // First collect just the strings for attribute names that have a
diff --git a/tools/aapt/XMLNode.h b/tools/aapt/XMLNode.h
index a9bea43..dc92fa7 100644
--- a/tools/aapt/XMLNode.h
+++ b/tools/aapt/XMLNode.h
@@ -124,6 +124,8 @@
 
     void removeWhitespace(bool stripAll=true, const char** cDataTags=NULL);
 
+    void setUTF8(bool val) { mUTF8 = val; }
+
     status_t parseValues(const sp<AaptAssets>& assets, ResourceTable* table);
 
     status_t assignResourceIds(const sp<AaptAssets>& assets,
@@ -189,6 +191,9 @@
     String8 mFilename;
     int32_t mStartLineNumber;
     int32_t mEndLineNumber;
+
+    // Encode compiled XML with UTF-8 StringPools?
+    bool mUTF8;
 };
 
 #endif