Modify the interface of HanziToPinyin class to make it generic and add test class

commit: 4256586663f0d045c69ea818db4893b3365b9915 [log] [tgz]
author: Bai Tao <michaelbai@google.com> Thu Jan 21 08:48:30 2010 +0800
committer: Bai Tao <michaelbai@google.com> Sat Jan 23 12:57:21 2010 +0800
tree: e14afc7537bac6b291e025cc6c2b48755adf3a04
parent: 52a014492c10d825ec26b2179bd8369bf78363ef [diff]
diff --git a/core/java/com/android/internal/util/HanziToPinyin.java b/core/java/com/android/internal/util/HanziToPinyin.java
index 4368e98..6a4adaa 100644
--- a/core/java/com/android/internal/util/HanziToPinyin.java
+++ b/core/java/com/android/internal/util/HanziToPinyin.java

@@ -16,8 +16,6 @@
 
 package com.android.internal.util;
 
-import com.google.android.util.AbstractMessageParser.Token;
-
 import android.text.TextUtils;
 import android.util.Log;
 
@@ -298,8 +296,10 @@
         };
 
     /** First and last Chinese character with known Pinyin according to zh collation */
-    private static final String FIRST_UNIHAN =  "\u5416";
-    private static final String LAST_UNIHAN =  "\u5497";
+    private static final String FIRST_PINYIN_UNIHAN =  "\u5416";
+    private static final String LAST_PINYIN_UNIHAN =  "\u5497";
+    /** The first Chinese character in Unicode (start of CJK Unified Ideographs Extension A) */
+    private static final char FIRST_UNIHAN = '\u3400';
     private static final Collator COLLATOR = Collator.getInstance(Locale.CHINA);
 
     private static HanziToPinyin sInstance;
@@ -311,10 +311,18 @@
          */
         public static final String SEPARATOR = " ";
 
-        public static final int ASCII = 1;
+        public static final int LATIN = 1;
         public static final int PINYIN = 2;
         public static final int UNKNOWN = 3;
 
+        public Token() {
+        }
+
+        public Token(int type, String source, String target) {
+            this.type = type;
+            this.source = source;
+            this.target = target;
+        }
         /**
          * Type of this token, ASCII, PINYIN or UNKNOWN.
          */
@@ -347,6 +355,7 @@
                     return sInstance;
                 }
             }
+            Log.w(TAG, "There is no Chinese collator, HanziToPinyin is disabled");
             sInstance = new HanziToPinyin(false);
             return sInstance;
         }
@@ -359,11 +368,15 @@
         int offset = -1;
         int cmp;
         if (character < 256) {
-            token.type = Token.ASCII;
+            token.type = Token.LATIN;
+            token.target = letter;
+            return token;
+        } else if (character < FIRST_UNIHAN) {
+            token.type = Token.UNKNOWN;
             token.target = letter;
             return token;
         } else {
-            cmp = COLLATOR.compare(letter, FIRST_UNIHAN);
+            cmp = COLLATOR.compare(letter, FIRST_PINYIN_UNIHAN);
             if (cmp < 0) {
                 token.type = Token.UNKNOWN;
                 token.target = letter;
@@ -372,7 +385,7 @@
                 token.type = Token.PINYIN;
                 offset = 0;
             } else {
-                cmp = COLLATOR.compare(letter, LAST_UNIHAN);
+                cmp = COLLATOR.compare(letter, LAST_PINYIN_UNIHAN);
                 if (cmp > 0) {
                     token.type = Token.UNKNOWN;
                     token.target = letter;
@@ -412,44 +425,71 @@
         return token;
     }
 
+    /**
+     * Converts the input to a list of tokens. Each space-free run of Latin
+     * characters, or of unknown characters, becomes one Token; each Hanzi
+     * character that has Pinyin becomes a Token of its own.
+     * If there is no China collator or the input is empty, an empty list is returned.
+     */
     public ArrayList<Token> get(final String input) {
-        if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
-            return null;
-        }
-
         ArrayList<Token> tokens = new ArrayList<Token>();
-        Token currentToken;
-
+        if (!mHasChinaCollator || TextUtils.isEmpty(input)) {
+            // return empty tokens.
+            return tokens;
+        }
         final int inputLength = input.length();
-
-        currentToken = getToken(input.charAt(0));
-
-        for (int i = 1; i < inputLength; i++) {
+        final StringBuilder sb = new StringBuilder();
+        int tokenType = Token.LATIN;
+        // Go through the input and start a new token when
+        // a. the token type changes,
+        // b. the current character has Pinyin, or
+        // c. the current character is a space.
+        for (int i = 0; i < inputLength; i++) {
             final char character = input.charAt(i);
-            Token token = getToken(character);
-
-            if (token.type != currentToken.type) {
-                currentToken.target = currentToken.target.trim();
-                tokens.add(currentToken);
-                currentToken = token;
+            if (character == ' ') {
+                if (sb.length() > 0) {
+                    addToken(sb, tokens, tokenType);
+                }
+            } else if (character < 256) {
+                if (tokenType != Token.LATIN && sb.length() > 0) {
+                    addToken(sb, tokens, tokenType);
+                }
+                tokenType = Token.LATIN;
+                sb.append(character);
+            } else if (character < FIRST_UNIHAN) {
+                if (tokenType != Token.UNKNOWN && sb.length() > 0) {
+                    addToken(sb, tokens, tokenType);
+                }
+                tokenType = Token.UNKNOWN;
+                sb.append(character);
             } else {
-                switch (token.type) {
-                    case Token.ASCII:
-                    case Token.UNKNOWN:
-                        currentToken.source += token.source;
-                        currentToken.target += token.target;
-                        break;
-                    case Token.PINYIN:
-                        currentToken.source += token.source;
-                        currentToken.target += " " + token.target;
-                        break;
+                Token t = getToken(character);
+                if (t.type == Token.PINYIN) {
+                    if (sb.length() > 0) {
+                        addToken(sb, tokens, tokenType);
+                    }
+                    tokens.add(t);
+                    tokenType = Token.PINYIN;
+                } else {
+                    if (tokenType != t.type && sb.length() > 0) {
+                        addToken(sb, tokens, tokenType);
+                    }
+                    tokenType = t.type;
+                    sb.append(character);
                 }
             }
         }
-
-        currentToken.target = currentToken.target.trim();
-        tokens.add(currentToken);
-
+        if (sb.length() > 0) {
+            addToken(sb, tokens, tokenType);
+        }
         return tokens;
     }
+
+    private void addToken(final StringBuilder sb, final ArrayList<Token> tokens,
+            final int tokenType) { // Flushes the text buffered in sb as one token of tokenType.
+        String str = sb.toString();
+        tokens.add(new Token(tokenType, str, str)); // source and target are both the raw text
+        sb.setLength(0); // clear the buffer so the next token starts empty
+    }
+
 }

diff --git a/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java b/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java
new file mode 100644
index 0000000..8e1ff0b
--- /dev/null
+++ b/tests/AndroidTests/src/com/android/unit_tests/internal/util/HanziToPinyinTest.java

@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.android.unit_tests.internal.util;
+
+import java.text.Collator;
+import java.util.ArrayList;
+import java.util.Locale;
+
+import android.test.suitebuilder.annotation.SmallTest;
+import android.util.Log;
+
+import com.android.internal.util.HanziToPinyin;
+import com.android.internal.util.HanziToPinyin.Token;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link HanziToPinyin#get(String)}.
+ *
+ * The expectations below assume a Chinese collator is available: without one,
+ * get() returns an empty token list and the size assertions fail.
+ */
+public class HanziToPinyinTest extends TestCase {
+    private final static String ONE_HANZI = "\u675C";
+    private final static String TWO_HANZI = "\u675C\u9D51";
+    private final static String ASCII = "test";
+    private final static String ONE_UNKNOWN = "\uFF71";
+    private final static String MISC = "test\u675C   Test with space\uFF71\uFF71\u675C";
+
+    /**
+     * Verifies how get() splits Hanzi, Latin, unknown and mixed input into
+     * tokens and which type each token is given.
+     */
+    @SmallTest
+    public void testGetToken() throws Exception {
+        final HanziToPinyin converter = HanziToPinyin.getInstance();
+
+        // JUnit's assertEquals takes (expected, actual); keeping that order makes
+        // failure messages report the right values.
+        ArrayList<Token> tokens = converter.get(ONE_HANZI);
+        assertEquals(1, tokens.size());
+        assertEquals(Token.PINYIN, tokens.get(0).type);
+        assertTrue(tokens.get(0).target.equalsIgnoreCase("DU"));
+
+        // Each Hanzi with known Pinyin becomes its own token.
+        tokens = converter.get(TWO_HANZI);
+        assertEquals(2, tokens.size());
+        assertEquals(Token.PINYIN, tokens.get(0).type);
+        assertEquals(Token.PINYIN, tokens.get(1).type);
+        assertTrue(tokens.get(0).target.equalsIgnoreCase("DU"));
+        assertTrue(tokens.get(1).target.equalsIgnoreCase("JUAN"));
+
+        // A run of Latin characters without spaces is a single token.
+        tokens = converter.get(ASCII);
+        assertEquals(1, tokens.size());
+        assertEquals(Token.LATIN, tokens.get(0).type);
+
+        tokens = converter.get(ONE_UNKNOWN);
+        assertEquals(1, tokens.size());
+        assertEquals(Token.UNKNOWN, tokens.get(0).type);
+
+        // Expected split: "test", Hanzi, "Test", "with", "space",
+        // the two unknown characters as one token, Hanzi.
+        tokens = converter.get(MISC);
+        assertEquals(7, tokens.size());
+        assertEquals(Token.LATIN, tokens.get(0).type);
+        assertEquals(Token.PINYIN, tokens.get(1).type);
+        assertEquals(Token.LATIN, tokens.get(2).type);
+        assertEquals(Token.LATIN, tokens.get(3).type);
+        assertEquals(Token.LATIN, tokens.get(4).type);
+        assertEquals(Token.UNKNOWN, tokens.get(5).type);
+        assertEquals(Token.PINYIN, tokens.get(6).type);
+    }
+}
commit	4256586663f0d045c69ea818db4893b3365b9915	[log] [tgz]
author	Bai Tao <michaelbai@google.com>	Thu Jan 21 08:48:30 2010 +0800
committer	Bai Tao <michaelbai@google.com>	Sat Jan 23 12:57:21 2010 +0800
tree	e14afc7537bac6b291e025cc6c2b48755adf3a04
parent	52a014492c10d825ec26b2179bd8369bf78363ef [diff]