blob: 5f8781c3bb319209ab4be8891050403eea1e80da [file] [log] [blame]
The Android Open Source Project455ed292009-03-13 13:04:22 -07001/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <stdio.h>
18#include <stdlib.h>
19
20#include "PhoneticStringUtils.h"
21
22// We'd like 0 length string last of sorted list. So when input string is NULL
23// or 0 length string, we use these instead.
24#define CODEPOINT_FOR_NULL_STR 0xFFFD
25#define STR_FOR_NULL_STR "\xEF\xBF\xBD"
26
27// We assume that users will not notice strings not sorted properly when the
28// first 128 characters are the same.
29#define MAX_CODEPOINTS 128
30
31namespace android {
32
33int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
34 if (src == NULL || len <= index) {
35 return -1;
36 }
37
38 if ((src[index] >> 7) == 0) {
39 if (next != NULL) {
40 *next = index + 1;
41 }
42 return src[index];
43 }
44 if ((src[index] & 64) == 0) {
45 return -1;
46 }
47 int mask;
48 size_t num_to_read;
49 for (num_to_read = 1, mask = 64; // 01000000
50 num_to_read < 7 && (src[index] & mask) == mask;
51 num_to_read++, mask >>= 1) {
52 }
53 if (num_to_read == 7) {
54 return -1;
55 }
56
57 if (num_to_read + index > len) {
58 return -1;
59 }
60
61 {
62 size_t i;
63 for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
64 mask = (mask << 1) + 1;
65 }
66 }
67
68 int codepoint = mask & src[index];
69
70 for (size_t i = 1; i < num_to_read; i++) {
71 if ((src[i + index] & 192) != 128) { // must be 10xxxxxx
72 return -1;
73 }
74 codepoint = (codepoint << 6) + (src[i + index] & 63);
75 }
76
77 if (next != NULL) {
78 *next = index + num_to_read;
79 }
80
81 return codepoint;
82}
83
84int GetPhoneticallySortableCodePoint(int codepoint,
85 int next_codepoint,
86 bool *next_is_consumed) {
87 if (next_is_consumed != NULL) {
88 *next_is_consumed = false;
89 }
90
91 if (codepoint <= 0x0020 || codepoint == 0x3000) {
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -070092 // Whitespace should be ignored.
The Android Open Source Project455ed292009-03-13 13:04:22 -070093 // Note: Formally, more "whitespace" exist. This block only
94 // handles part of them
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -070095 return -1;
The Android Open Source Project455ed292009-03-13 13:04:22 -070096 } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
97 (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
98 // Ascii and fullwidth ascii
99
100 if (0x0021 <= codepoint && codepoint <= 0x007E) {
101 // Convert ascii to fullwidth ascii so that they become
102 // behind hiragana.
103 // 65248 = 0xFF01 - 0x0021
104 codepoint += 65248;
105 }
106
107 // Now, there is only fullwidth ascii.
108 if (0xFF10 <= codepoint && codepoint <= 0xFF19) {
109 // Numbers should be after alphabets but before symbols.
110 // 86 = 0xFF66
111 // (the beginning of halfwidth-katakankana space) - 0xFF10
112 return codepoint + 86;
113 } else if (0xFF41 <= codepoint && codepoint <= 0xFF5A) {
114 // Make lower alphabets same as capital alphabets.
115 // 32 = 0xFF41 - 0xFF21
116 return codepoint - 32;
117 } else if (0xFF01 <= codepoint && codepoint <= 0xFF0F) {
118 // Symbols (Ascii except alphabet nor number)
119 // These should be at the end of sorting, just after numebers
120 // (see below)
121 //
122 // We use halfwidth-katakana space for storing those symbols.
123 // 111 = 0xFF70 (0xFF19 + 86 + 1) - 0xFF01
124 return codepoint + 111;
125 } else if (0xFF1A <= codepoint && codepoint <= 0xFF20) {
126 // Symbols (cont.)
127 // 101 = 0xFF7F (0xFF0F + 111 + 1) - 0xFF1A
128 return codepoint + 101;
129 } else if (0xFF3B <= codepoint && codepoint <= 0xFF40) {
130 // Symbols (cont.)
131 // 75 = 0xFF86 (0xFF20 + 101 + 1) - 0xFF3B (= 101 - 26)
132 return codepoint + 75;
133 } else if (0xFF5B <= codepoint && codepoint <= 0xFF5E) {
134 // Symbols (cont.)
135 // 49 = 0xFF8C (0xFF40 + 75 + 1) - 0xFF5B (= 75 - 26)
136 return codepoint + 49;
137 } else {
138 return codepoint;
139 }
140 } else if (codepoint == 0x02DC || codepoint == 0x223C) {
141 // tilde
142 return 0xFF5E;
143 } else if (codepoint <= 0x3040 ||
144 (0x3100 <= codepoint && codepoint < 0xFF00) ||
145 codepoint == CODEPOINT_FOR_NULL_STR) {
146 // Move Kanji and other non-Japanese characters behind symbols.
147 return codepoint + 0x10000;
148 }
149
150 // Below is Kana-related handling.
151
152 // First, convert fullwidth katakana and halfwidth katakana to hiragana
153 if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
154 // Make fullwidth katakana same as hiragana.
155 // 96 == 0x30A1 - 0x3041c
156 codepoint = codepoint - 96;
157 } else if (0xFF66 <= codepoint && codepoint <= 0xFF9F) {
158 // Make halfwidth katakana same as hiragana
159 switch (codepoint) {
160 case 0xFF66: // wo
161 codepoint = 0x3092;
162 break;
163 case 0xFF67: // xa
164 codepoint = 0x3041;
165 break;
166 case 0xFF68: // xi
167 codepoint = 0x3043;
168 break;
169 case 0xFF69: // xu
170 codepoint = 0x3045;
171 break;
172 case 0xFF6A: // xe
173 codepoint = 0x3047;
174 break;
175 case 0xFF6B: // xo
176 codepoint = 0x3049;
177 break;
178 case 0xFF6C: // xya
179 codepoint = 0x3083;
180 break;
181 case 0xFF6D: // xyu
182 codepoint = 0x3085;
183 break;
184 case 0xFF6E: // xyo
185 codepoint = 0x3087;
186 break;
187 case 0xFF6F: // xtsu
188 codepoint = 0x3063;
189 break;
190 case 0xFF70: // -
191 codepoint = 0x30FC;
192 break;
193 case 0xFF9C: // wa
194 codepoint = 0x308F;
195 break;
196 case 0xFF9D: // n
197 codepoint = 0x3093;
198 break;
199 default:
200 {
201 if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
202 // a, i, u, e, o
203 if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
204 if (next_is_consumed != NULL) {
205 *next_is_consumed = true;
206 }
207 codepoint = 0x3094; // vu
208 } else {
209 codepoint = 0x3042 + (codepoint - 0xFF71) * 2;
210 }
211 } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
212 // ka - chi
213 if (next_codepoint == 0xFF9E) {
214 // "dakuten" (voiced mark)
215 if (next_is_consumed != NULL) {
216 *next_is_consumed = true;
217 }
218 codepoint = 0x304B + (codepoint - 0xFF76) * 2 + 1;
219 } else {
220 codepoint = 0x304B + (codepoint - 0xFF76) * 2;
221 }
222 } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
223 // tsu, te, to (skip xtsu)
224 if (next_codepoint == 0xFF9E) {
225 // "dakuten" (voiced mark)
226 if (next_is_consumed != NULL) {
227 *next_is_consumed = true;
228 }
229 codepoint = 0x3064 + (codepoint - 0xFF82) * 2 + 1;
230 } else {
231 codepoint = 0x3064 + (codepoint - 0xFF82) * 2;
232 }
233 } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
234 // na, ni, nu, ne, no
235 codepoint = 0x306A + (codepoint - 0xFF85);
236 } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
237 // ha, hi, hu, he, ho
238 if (next_codepoint == 0xFF9E) {
239 // "dakuten" (voiced mark)
240 if (next_is_consumed != NULL) {
241 *next_is_consumed = true;
242 }
243 codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 1;
244 } else if (next_codepoint == 0xFF9F) {
245 // "han-dakuten" (half voiced mark)
246 if (next_is_consumed != NULL) {
247 *next_is_consumed = true;
248 }
249 codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 2;
250 } else {
251 codepoint = 0x306F + (codepoint - 0xFF8A) * 3;
252 }
253 } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
254 // ma, mi, mu, me, mo
255 codepoint = 0x307E + (codepoint - 0xFF8F);
256 } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
257 // ya, yu, yo
258 codepoint = 0x3084 + (codepoint - 0xFF94) * 2;
259 } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
260 // ra, ri, ru, re, ro
261 codepoint = 0x3089 + (codepoint - 0xFF97);
262 }
263 // Note: 0xFF9C, 0xFF9D are handled above
264 } // end of default
265 } // end of case
266 }
267
268 // Trivial kana conversions.
269 // e.g. xa => a
270 switch (codepoint) {
271 case 0x3041:
272 case 0x3043:
273 case 0x3045:
274 case 0x3047:
275 case 0x3049:
276 case 0x308E: // xwa
277 codepoint++;
278 break;
279 case 0x3095: // xka
280 codepoint = 0x304B;
281 break;
282 case 0x3096: // xku
283 codepoint = 0x304F;
284 break;
285 }
286
287 return codepoint;
288}
289
290bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
291 if (codepoint < 128) { // 1 << 7
292 if (*index >= len) {
293 return false;
294 }
295 // 0xxxxxxx
296 dst[*index] = static_cast<char>(codepoint);
297 (*index)++;
298 } else if (codepoint < 2048) { // 1 << (6 + 5)
299 if (*index + 1 >= len) {
300 return false;
301 }
302 // 110xxxxx
303 dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
304 // 10xxxxxx
305 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
306 } else if (codepoint < 65536) { // 1 << (6 * 2 + 4)
307 if (*index + 2 >= len) {
308 return false;
309 }
310 // 1110xxxx
311 dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
312 // 10xxxxxx
313 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
314 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
315 } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3)
316 if (*index + 3 >= len) {
317 return false;
318 }
319 // 11110xxx
320 dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
321 // 10xxxxxx
322 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
323 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
324 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
325 } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2)
326 if (*index + 4 >= len) {
327 return false;
328 }
329 // 111110xx
330 dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
331 // 10xxxxxx
332 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
333 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
334 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
335 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
336 } else {
337 if (*index + 5 >= len) {
338 return false;
339 }
340 // 1111110x
341 dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
342 // 10xxxxxx
343 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
344 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
345 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
346 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
347 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
348 }
349 return true;
350}
351
352bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len){
353 if (dst == NULL || len == NULL) {
354 return false;
355 }
356
357 if (src == NULL || *src == '\0') {
358 src = STR_FOR_NULL_STR;
359 }
360
361 size_t src_len = strlen(src);
362 int codepoints[MAX_CODEPOINTS];
363 size_t new_len = 0;
364
365 size_t codepoint_index;
366 {
367 int i, next;
368 for (codepoint_index = 0, i = 0, next = 0;
369 static_cast<size_t>(i) < src_len &&
370 codepoint_index < MAX_CODEPOINTS;
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700371 i = next) {
The Android Open Source Project455ed292009-03-13 13:04:22 -0700372 int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
373 if (codepoint <= 0) {
374 return false;
375 }
376 int tmp_next;
377 int next_codepoint = GetCodePointFromUtf8(src, src_len,
378 next, &tmp_next);
379 bool next_is_consumed = false;
380
381 // It is ok even if next_codepoint is negative.
382 codepoints[codepoint_index] =
383 GetPhoneticallySortableCodePoint(codepoint,
384 next_codepoint,
385 &next_is_consumed);
The Android Open Source Project455ed292009-03-13 13:04:22 -0700386 // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
387 if (next_is_consumed) {
388 next = tmp_next;
389 }
390
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700391 if (codepoints[codepoint_index] < 0) {
392 // Do not increment codepoint_index.
393 continue;
394 }
395
The Android Open Source Project455ed292009-03-13 13:04:22 -0700396 if (codepoints[codepoint_index] < 128) { // 1 << 7
397 new_len++;
398 } else if (codepoints[codepoint_index] < 2048) {
399 // 1 << (6 + 5)
400 new_len += 2;
401 } else if (codepoints[codepoint_index] < 65536) {
402 // 1 << (6 * 2 + 4)
403 new_len += 3;
404 } else if (codepoints[codepoint_index] < 2097152) {
405 // 1 << (6 * 3 + 3)
406 new_len += 4;
407 } else if (codepoints[codepoint_index] < 67108864) {
408 // 1 << (6 * 2 + 2)
409 new_len += 5;
410 } else {
411 new_len += 6;
412 }
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700413
414 codepoint_index++;
The Android Open Source Project455ed292009-03-13 13:04:22 -0700415 }
416 }
417
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700418 if (codepoint_index == 0) {
419 // If all of codepoints are invalid, we place the string at the end of
420 // the list.
421 codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
422 codepoint_index = 1;
423 new_len = 4;
424 }
425
The Android Open Source Project455ed292009-03-13 13:04:22 -0700426 new_len += 1; // For '\0'.
427
428 *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
429 if (*dst == NULL) {
430 return false;
431 }
432
433 size_t ch_index;
434 {
435 size_t i;
436 for (i = 0, ch_index = 0; i < codepoint_index; i++) {
437 if (!GetUtf8FromCodePoint(codepoints[i], *dst,
438 new_len, &ch_index)) {
439 free(*dst);
440 *dst = NULL;
441 return false;
442 }
443 }
444 }
445
446 if (ch_index != new_len - 1) {
447 free(*dst);
448 *dst = NULL;
449 return false;
450 }
451
452 (*dst)[new_len - 1] = '\0';
453 *len = new_len;
454 return true;
455}
456
457} // namespace android