| Dmitri Plotnikov | 3a74962 | 2010-03-03 11:29:46 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright 2010, The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | #include <ctype.h> |
| 18 | #include <string.h> |
| 19 | |
| 20 | #include <unicode/ucol.h> |
| 21 | #include <unicode/uiter.h> |
| 22 | #include <unicode/ustring.h> |
| 23 | #include <unicode/utypes.h> |
| 24 | |
| 25 | #include "PhonebookIndex.h" |
| 26 | #include "PhoneticStringUtils.h" |
| 27 | |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 28 | #define MIN_OUTPUT_SIZE 6 // Minimum required size for the output buffer (in bytes) |
| Dmitri Plotnikov | 3a74962 | 2010-03-03 11:29:46 -0800 | [diff] [blame] | 29 | |
| 30 | namespace android { |
| 31 | |
| 32 | // IMPORTANT! Keep the codes below SORTED. We are doing a binary search on the array |
| 33 | static UChar DEFAULT_CHAR_MAP[] = { |
| 34 | 0x00C6, 'A', // AE |
| 35 | 0x00DF, 'S', // Etzett |
| 36 | 0x1100, 0x3131, // HANGUL LETTER KIYEOK |
| 37 | 0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK |
| 38 | 0x1102, 0x3134, // HANGUL LETTER NIEUN |
| 39 | 0x1103, 0x3137, // HANGUL LETTER TIKEUT |
| 40 | 0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT |
| 41 | 0x1105, 0x3139, // HANGUL LETTER RIEUL |
| 42 | 0x1106, 0x3141, // HANGUL LETTER MIEUM |
| 43 | 0x1107, 0x3142, // HANGUL LETTER PIEUP |
| 44 | 0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP |
| 45 | 0x1109, 0x3145, // HANGUL LETTER SIOS |
| 46 | 0x110A, 0x3146, // HANGUL LETTER SSANGSIOS |
| 47 | 0x110B, 0x3147, // HANGUL LETTER IEUNG |
| 48 | 0x110C, 0x3148, // HANGUL LETTER CIEUC |
| 49 | 0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC |
| 50 | 0x110E, 0x314A, // HANGUL LETTER CHIEUCH |
| 51 | 0x110F, 0x314B, // HANGUL LETTER KHIEUKH |
| 52 | 0x1110, 0x314C, // HANGUL LETTER THIEUTH |
| 53 | 0x1111, 0x314D, // HANGUL LETTER PHIEUPH |
| 54 | 0x1112, 0x314E, // HANGUL LETTER HIEUH |
| 55 | 0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH |
| 56 | 0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS |
| 57 | 0x1161, 0x314F, // HANGUL LETTER A |
| 58 | 0x1162, 0x3150, // HANGUL LETTER AE |
| 59 | 0x1163, 0x3151, // HANGUL LETTER YA |
| 60 | 0x1164, 0x3152, // HANGUL LETTER YAE |
| 61 | 0x1165, 0x3153, // HANGUL LETTER EO |
| 62 | 0x1166, 0x3154, // HANGUL LETTER E |
| 63 | 0x1167, 0x3155, // HANGUL LETTER YEO |
| 64 | 0x1168, 0x3156, // HANGUL LETTER YE |
| 65 | 0x1169, 0x3157, // HANGUL LETTER O |
| 66 | 0x116A, 0x3158, // HANGUL LETTER WA |
| 67 | 0x116B, 0x3159, // HANGUL LETTER WAE |
| 68 | 0x116C, 0x315A, // HANGUL LETTER OE |
| 69 | 0x116D, 0x315B, // HANGUL LETTER YO |
| 70 | 0x116E, 0x315C, // HANGUL LETTER U |
| 71 | 0x116F, 0x315D, // HANGUL LETTER WEO |
| 72 | 0x1170, 0x315E, // HANGUL LETTER WE |
| 73 | 0x1171, 0x315F, // HANGUL LETTER WI |
| 74 | 0x1172, 0x3160, // HANGUL LETTER YU |
| 75 | 0x1173, 0x3161, // HANGUL LETTER EU |
| 76 | 0x1174, 0x3162, // HANGUL LETTER YI |
| 77 | 0x1175, 0x3163, // HANGUL LETTER I |
| 78 | 0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS |
| 79 | 0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC |
| 80 | 0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH |
| 81 | 0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK |
| 82 | 0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM |
| 83 | 0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS |
| 84 | 0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH |
| 85 | 0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH |
| 86 | }; |
| 87 | |
| 88 | /** |
| 89 | * Binary search to map an individual character to the corresponding phone book index. |
| 90 | */ |
| 91 | static UChar map_character(UChar c, UChar * char_map, int32_t length) { |
| 92 | int from = 0, to = length; |
| 93 | while (from < to) { |
| 94 | int m = ((to + from) >> 1) & ~0x1; // Only consider even positions |
| 95 | UChar cm = char_map[m]; |
| 96 | if (cm == c) { |
| 97 | return char_map[m + 1]; |
| 98 | } else if (cm < c) { |
| 99 | from = m + 2; |
| 100 | } else { |
| 101 | to = m; |
| 102 | } |
| 103 | } |
| 104 | return 0; |
| 105 | } |
| 106 | |
| 107 | /** |
| 108 | * Returns TRUE if the character belongs to a Hanzi unicode block |
| 109 | */ |
| 110 | static bool is_CJK(UChar c) { |
| 111 | return |
| 112 | (0x4e00 <= c && c <= 0x9fff) // CJK_UNIFIED_IDEOGRAPHS |
| 113 | || (0x3400 <= c && c <= 0x4dbf) // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A |
| 114 | || (0x3000 <= c && c <= 0x303f) // CJK_SYMBOLS_AND_PUNCTUATION |
| 115 | || (0x2e80 <= c && c <= 0x2eff) // CJK_RADICALS_SUPPLEMENT |
| 116 | || (0x3300 <= c && c <= 0x33ff) // CJK_COMPATIBILITY |
| 117 | || (0xfe30 <= c && c <= 0xfe4f) // CJK_COMPATIBILITY_FORMS |
| 118 | || (0xf900 <= c && c <= 0xfaff); // CJK_COMPATIBILITY_IDEOGRAPHS |
| 119 | } |
| 120 | |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 121 | int32_t GetPhonebookIndex(UCharIterator * iter, const char * locale, UChar * out, int32_t size, |
| 122 | UBool * isError) |
| 123 | { |
| 124 | if (size < MIN_OUTPUT_SIZE) { |
| 125 | *isError = TRUE; |
| 126 | return 0; |
| 127 | } |
| Dmitri Plotnikov | 3a74962 | 2010-03-03 11:29:46 -0800 | [diff] [blame] | 128 | |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 129 | *isError = FALSE; |
| 130 | |
| 131 | // Normalize the first character to remove accents using the NFD normalization |
| 132 | UErrorCode errorCode = U_ZERO_ERROR; |
| 133 | int32_t len = unorm_next(iter, out, size, UNORM_NFD, |
| 134 | 0 /* options */, TRUE /* normalize */, NULL, &errorCode); |
| 135 | if (U_FAILURE(errorCode)) { |
| 136 | *isError = TRUE; |
| 137 | return 0; |
| 138 | } |
| 139 | |
| 140 | if (len == 0) { // Empty input string |
| 141 | return 0; |
| 142 | } |
| 143 | |
| 144 | UChar c = out[0]; |
| 145 | |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 146 | if (!u_isalpha(c)) { |
| Daniel Lehmann | 167db39 | 2010-08-18 15:41:33 -0700 | [diff] [blame] | 147 | // Digits go into a # section. Everything else goes into the empty section |
| 148 | // The unicode function u_isdigit would also identify other characters as digits (arabic), |
| 149 | // but if we caught them here we'd risk having the same section before and after alpha-letters |
| 150 | // which might break the assumption that each section exists only once |
| 151 | if (c >= '0' && c <= '9') { |
| 152 | out[0] = '#'; |
| 153 | return 1; |
| 154 | } |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 155 | return 0; |
| 156 | } |
| 157 | |
| 158 | c = u_toupper(c); |
| 159 | |
| 160 | // Check for explicitly mapped characters |
| 161 | UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar)); |
| 162 | if (c_mapped != 0) { |
| 163 | out[0] = c_mapped; |
| 164 | return 1; |
| 165 | } |
| 166 | |
| 167 | // Convert Kanas to Hiragana |
| 168 | UChar next = len > 2 ? out[1] : 0; |
| 169 | c = android::GetNormalizedCodePoint(c, next, NULL); |
| 170 | |
| 171 | // Traditional grouping of Hiragana characters |
| Yutaro Ogasawara | ae72de9 | 2011-11-23 11:14:21 +0900 | [diff] [blame^] | 172 | if (0x3041 <= c && c <= 0x309F) { |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 173 | if (c < 0x304B) c = 0x3042; // a |
| 174 | else if (c < 0x3055) c = 0x304B; // ka |
| 175 | else if (c < 0x305F) c = 0x3055; // sa |
| 176 | else if (c < 0x306A) c = 0x305F; // ta |
| 177 | else if (c < 0x306F) c = 0x306A; // na |
| 178 | else if (c < 0x307E) c = 0x306F; // ha |
| Yutaro Ogasawara | ae72de9 | 2011-11-23 11:14:21 +0900 | [diff] [blame^] | 179 | else if (c < 0x3083) c = 0x307E; // ma |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 180 | else if (c < 0x3089) c = 0x3084; // ya |
| Yutaro Ogasawara | ae72de9 | 2011-11-23 11:14:21 +0900 | [diff] [blame^] | 181 | else if (c < 0x308E) c = 0x3089; // ra |
| 182 | else if (c < 0x3094) c = 0x308F; // wa |
| 183 | else return 0; // Others are not readable |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 184 | out[0] = c; |
| 185 | return 1; |
| Yutaro Ogasawara | ae72de9 | 2011-11-23 11:14:21 +0900 | [diff] [blame^] | 186 | } else if (0x30A0 <= c && c <= 0x30FF) { |
| 187 | // Dot, onbiki, iteration marks are not readable |
| 188 | return 0; |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 189 | } |
| 190 | |
| 191 | if (is_CJK(c)) { |
| 192 | if (strncmp(locale, "ja", 2) == 0) { |
| 193 | // Japanese word meaning "misc" or "other" |
| Dmitri Plotnikov | 78def01 | 2010-03-04 09:12:22 -0800 | [diff] [blame] | 194 | out[0] = 0x4ED6; |
| 195 | return 1; |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 196 | } else { |
| Dmitri Plotnikov | 3a74962 | 2010-03-03 11:29:46 -0800 | [diff] [blame] | 197 | return 0; |
| 198 | } |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 199 | } |
| Dmitri Plotnikov | 3a74962 | 2010-03-03 11:29:46 -0800 | [diff] [blame] | 200 | |
| Dmitri Plotnikov | 4b2aeb8 | 2010-03-03 14:48:34 -0800 | [diff] [blame] | 201 | out[0] = c; |
| 202 | return 1; |
| Dmitri Plotnikov | 3a74962 | 2010-03-03 11:29:46 -0800 | [diff] [blame] | 203 | } |
| 204 | |
| 205 | } // namespace android |