blob: 24b1647ed3ed2f844866532cef8876c4207584da [file] [log] [blame]
The Android Open Source Project455ed292009-03-13 13:04:22 -07001/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <stdio.h>
18#include <stdlib.h>
19
20#include "PhoneticStringUtils.h"
21
22// We'd like 0 length string last of sorted list. So when input string is NULL
23// or 0 length string, we use these instead.
24#define CODEPOINT_FOR_NULL_STR 0xFFFD
25#define STR_FOR_NULL_STR "\xEF\xBF\xBD"
26
27// We assume that users will not notice strings not sorted properly when the
28// first 128 characters are the same.
29#define MAX_CODEPOINTS 128
30
31namespace android {
32
33int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
34 if (src == NULL || len <= index) {
35 return -1;
36 }
37
38 if ((src[index] >> 7) == 0) {
39 if (next != NULL) {
40 *next = index + 1;
41 }
42 return src[index];
43 }
44 if ((src[index] & 64) == 0) {
45 return -1;
46 }
47 int mask;
48 size_t num_to_read;
49 for (num_to_read = 1, mask = 64; // 01000000
50 num_to_read < 7 && (src[index] & mask) == mask;
51 num_to_read++, mask >>= 1) {
52 }
53 if (num_to_read == 7) {
54 return -1;
55 }
56
57 if (num_to_read + index > len) {
58 return -1;
59 }
60
61 {
62 size_t i;
63 for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
64 mask = (mask << 1) + 1;
65 }
66 }
67
68 int codepoint = mask & src[index];
69
70 for (size_t i = 1; i < num_to_read; i++) {
71 if ((src[i + index] & 192) != 128) { // must be 10xxxxxx
72 return -1;
73 }
74 codepoint = (codepoint << 6) + (src[i + index] & 63);
75 }
76
77 if (next != NULL) {
78 *next = index + num_to_read;
79 }
80
81 return codepoint;
82}
83
Daisuke Miyakawad28cdc42009-05-18 14:51:52 +090084// Get hiragana from halfwidth katakana.
85static int GetHiraganaFromHalfwidthKatakana(int codepoint,
86 int next_codepoint,
87 bool *next_is_consumed) {
88 if (codepoint < 0xFF66 || 0xFF9F < codepoint) {
89 return codepoint;
90 }
91
92 switch (codepoint) {
93 case 0xFF66: // wo
94 return 0x3092;
95 case 0xFF67: // xa
96 return 0x3041;
97 case 0xFF68: // xi
98 return 0x3043;
99 case 0xFF69: // xu
100 return 0x3045;
101 case 0xFF6A: // xe
102 return 0x3047;
103 case 0xFF6B: // xo
104 return 0x3049;
105 case 0xFF6C: // xya
106 return 0x3083;
107 case 0xFF6D: // xyu
108 return 0x3085;
109 case 0xFF6E: // xyo
110 return 0x3087;
111 case 0xFF6F: // xtsu
112 return 0x3063;
113 case 0xFF70: // -
114 return 0x30FC;
115 case 0xFF9C: // wa
116 return 0x308F;
117 case 0xFF9D: // n
118 return 0x3093;
119 break;
120 default: {
121 if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
122 // a, i, u, e, o
123 if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
124 if (next_is_consumed != NULL) {
125 *next_is_consumed = true;
126 }
127 return 0x3094; // vu
128 } else {
129 return 0x3042 + (codepoint - 0xFF71) * 2;
130 }
131 } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
132 // ka - chi
133 if (next_codepoint == 0xFF9E) {
134 // "dakuten" (voiced mark)
135 if (next_is_consumed != NULL) {
136 *next_is_consumed = true;
137 }
138 return 0x304B + (codepoint - 0xFF76) * 2 + 1;
139 } else {
140 return 0x304B + (codepoint - 0xFF76) * 2;
141 }
142 } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
143 // tsu, te, to (skip xtsu)
144 if (next_codepoint == 0xFF9E) {
145 // "dakuten" (voiced mark)
146 if (next_is_consumed != NULL) {
147 *next_is_consumed = true;
148 }
149 return 0x3064 + (codepoint - 0xFF82) * 2 + 1;
150 } else {
151 return 0x3064 + (codepoint - 0xFF82) * 2;
152 }
153 } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
154 // na, ni, nu, ne, no
155 return 0x306A + (codepoint - 0xFF85);
156 } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
157 // ha, hi, hu, he, ho
158 if (next_codepoint == 0xFF9E) {
159 // "dakuten" (voiced mark)
160 if (next_is_consumed != NULL) {
161 *next_is_consumed = true;
162 }
163 return 0x306F + (codepoint - 0xFF8A) * 3 + 1;
164 } else if (next_codepoint == 0xFF9F) {
165 // "han-dakuten" (half voiced mark)
166 if (next_is_consumed != NULL) {
167 *next_is_consumed = true;
168 }
169 return 0x306F + (codepoint - 0xFF8A) * 3 + 2;
170 } else {
171 return 0x306F + (codepoint - 0xFF8A) * 3;
172 }
173 } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
174 // ma, mi, mu, me, mo
175 return 0x307E + (codepoint - 0xFF8F);
176 } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
177 // ya, yu, yo
178 return 0x3084 + (codepoint - 0xFF94) * 2;
179 } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
180 // ra, ri, ru, re, ro
181 return 0x3089 + (codepoint - 0xFF97);
182 }
183 // Note: 0xFF9C, 0xFF9D are handled above
184 } // end of default
185 }
186
187 return codepoint;
188}
189
190// Assuming input is hiragana, convert the hiragana to "normalized" hiragana.
191static int GetNormalizedHiragana(int codepoint) {
192 if (codepoint < 0x3040 || 0x309F < codepoint) {
193 return codepoint;
194 }
195
196 // TODO: should care (semi-)voiced mark (0x3099, 0x309A).
197
198 // Trivial kana conversions.
199 // e.g. xa => a
200 switch (codepoint) {
201 case 0x3041:
202 case 0x3043:
203 case 0x3045:
204 case 0x3047:
205 case 0x3049:
206 case 0x308E: // xwa
207 return codepoint + 1;
208 case 0x3095: // xka
209 return 0x304B;
210 case 0x3096: // xku
211 return 0x304F;
212 default:
213 return codepoint;
214 }
215}
216
217static int GetNormalizedKana(int codepoint,
218 int next_codepoint,
219 bool *next_is_consumed) {
220 // First, convert fullwidth katakana and halfwidth katakana to hiragana.
221 if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
222 // Make fullwidth katakana same as hiragana.
223 // 96 == 0x30A1 - 0x3041c
224 codepoint = codepoint - 96;
225 } else {
226 codepoint = GetHiraganaFromHalfwidthKatakana(
227 codepoint, next_codepoint, next_is_consumed);
228 }
229
230 // Normalize Hiragana.
231 return GetNormalizedHiragana(codepoint);
232}
233
The Android Open Source Project455ed292009-03-13 13:04:22 -0700234int GetPhoneticallySortableCodePoint(int codepoint,
235 int next_codepoint,
236 bool *next_is_consumed) {
237 if (next_is_consumed != NULL) {
238 *next_is_consumed = false;
239 }
240
241 if (codepoint <= 0x0020 || codepoint == 0x3000) {
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700242 // Whitespace should be ignored.
The Android Open Source Project455ed292009-03-13 13:04:22 -0700243 // Note: Formally, more "whitespace" exist. This block only
244 // handles part of them
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700245 return -1;
The Android Open Source Project455ed292009-03-13 13:04:22 -0700246 } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
247 (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
248 // Ascii and fullwidth ascii
249
250 if (0x0021 <= codepoint && codepoint <= 0x007E) {
251 // Convert ascii to fullwidth ascii so that they become
252 // behind hiragana.
253 // 65248 = 0xFF01 - 0x0021
254 codepoint += 65248;
255 }
256
257 // Now, there is only fullwidth ascii.
258 if (0xFF10 <= codepoint && codepoint <= 0xFF19) {
259 // Numbers should be after alphabets but before symbols.
260 // 86 = 0xFF66
261 // (the beginning of halfwidth-katakankana space) - 0xFF10
262 return codepoint + 86;
263 } else if (0xFF41 <= codepoint && codepoint <= 0xFF5A) {
264 // Make lower alphabets same as capital alphabets.
265 // 32 = 0xFF41 - 0xFF21
266 return codepoint - 32;
267 } else if (0xFF01 <= codepoint && codepoint <= 0xFF0F) {
268 // Symbols (Ascii except alphabet nor number)
269 // These should be at the end of sorting, just after numebers
270 // (see below)
271 //
272 // We use halfwidth-katakana space for storing those symbols.
273 // 111 = 0xFF70 (0xFF19 + 86 + 1) - 0xFF01
274 return codepoint + 111;
275 } else if (0xFF1A <= codepoint && codepoint <= 0xFF20) {
276 // Symbols (cont.)
277 // 101 = 0xFF7F (0xFF0F + 111 + 1) - 0xFF1A
278 return codepoint + 101;
279 } else if (0xFF3B <= codepoint && codepoint <= 0xFF40) {
280 // Symbols (cont.)
281 // 75 = 0xFF86 (0xFF20 + 101 + 1) - 0xFF3B (= 101 - 26)
282 return codepoint + 75;
283 } else if (0xFF5B <= codepoint && codepoint <= 0xFF5E) {
284 // Symbols (cont.)
285 // 49 = 0xFF8C (0xFF40 + 75 + 1) - 0xFF5B (= 75 - 26)
286 return codepoint + 49;
287 } else {
288 return codepoint;
289 }
290 } else if (codepoint == 0x02DC || codepoint == 0x223C) {
291 // tilde
292 return 0xFF5E;
293 } else if (codepoint <= 0x3040 ||
294 (0x3100 <= codepoint && codepoint < 0xFF00) ||
295 codepoint == CODEPOINT_FOR_NULL_STR) {
296 // Move Kanji and other non-Japanese characters behind symbols.
297 return codepoint + 0x10000;
298 }
299
300 // Below is Kana-related handling.
301
Daisuke Miyakawad28cdc42009-05-18 14:51:52 +0900302 return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
The Android Open Source Project455ed292009-03-13 13:04:22 -0700303}
304
Daisuke Miyakawad28cdc42009-05-18 14:51:52 +0900305int GetNormalizedCodePoint(int codepoint,
306 int next_codepoint,
307 bool *next_is_consumed) {
308 if (next_is_consumed != NULL) {
309 *next_is_consumed = false;
310 }
311
312 if (codepoint <= 0x0020 || codepoint == 0x3000) {
313 // Whitespaces. Keep it as is.
314 return codepoint;
315 } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
316 (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
317 // Ascii and fullwidth ascii. Keep it as is
318 return codepoint;
319 } else if (codepoint == 0x02DC || codepoint == 0x223C) {
320 // tilde
321 return 0xFF5E;
322 } else if (codepoint <= 0x3040 ||
323 (0x3100 <= codepoint && codepoint < 0xFF00) ||
324 codepoint == CODEPOINT_FOR_NULL_STR) {
325 // Keep it as is.
326 return codepoint;
327 }
328
329 // Below is Kana-related handling.
330
331 return GetNormalizedKana(codepoint, next_codepoint, next_is_consumed);
332}
333
334
The Android Open Source Project455ed292009-03-13 13:04:22 -0700335bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
336 if (codepoint < 128) { // 1 << 7
337 if (*index >= len) {
338 return false;
339 }
340 // 0xxxxxxx
341 dst[*index] = static_cast<char>(codepoint);
342 (*index)++;
343 } else if (codepoint < 2048) { // 1 << (6 + 5)
344 if (*index + 1 >= len) {
345 return false;
346 }
347 // 110xxxxx
348 dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
349 // 10xxxxxx
350 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
351 } else if (codepoint < 65536) { // 1 << (6 * 2 + 4)
352 if (*index + 2 >= len) {
353 return false;
354 }
355 // 1110xxxx
356 dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
357 // 10xxxxxx
358 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
359 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
360 } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3)
361 if (*index + 3 >= len) {
362 return false;
363 }
364 // 11110xxx
365 dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
366 // 10xxxxxx
367 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
368 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
369 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
370 } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2)
371 if (*index + 4 >= len) {
372 return false;
373 }
374 // 111110xx
375 dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
376 // 10xxxxxx
377 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
378 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
379 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
380 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
381 } else {
382 if (*index + 5 >= len) {
383 return false;
384 }
385 // 1111110x
386 dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
387 // 10xxxxxx
388 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
389 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
390 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
391 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
392 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
393 }
394 return true;
395}
396
Daisuke Miyakawad28cdc42009-05-18 14:51:52 +0900397static bool GetExpectedString(
398 const char *src, char **dst, size_t *len,
399 int (*get_codepoint_function)(int, int, bool*)) {
The Android Open Source Project455ed292009-03-13 13:04:22 -0700400 if (dst == NULL || len == NULL) {
401 return false;
402 }
403
404 if (src == NULL || *src == '\0') {
405 src = STR_FOR_NULL_STR;
406 }
407
408 size_t src_len = strlen(src);
409 int codepoints[MAX_CODEPOINTS];
410 size_t new_len = 0;
411
412 size_t codepoint_index;
413 {
414 int i, next;
415 for (codepoint_index = 0, i = 0, next = 0;
416 static_cast<size_t>(i) < src_len &&
417 codepoint_index < MAX_CODEPOINTS;
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700418 i = next) {
The Android Open Source Project455ed292009-03-13 13:04:22 -0700419 int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
420 if (codepoint <= 0) {
421 return false;
422 }
423 int tmp_next;
424 int next_codepoint = GetCodePointFromUtf8(src, src_len,
425 next, &tmp_next);
426 bool next_is_consumed = false;
427
428 // It is ok even if next_codepoint is negative.
429 codepoints[codepoint_index] =
Daisuke Miyakawad28cdc42009-05-18 14:51:52 +0900430 get_codepoint_function(codepoint,
431 next_codepoint,
432 &next_is_consumed);
The Android Open Source Project455ed292009-03-13 13:04:22 -0700433 // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
434 if (next_is_consumed) {
435 next = tmp_next;
436 }
437
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700438 if (codepoints[codepoint_index] < 0) {
439 // Do not increment codepoint_index.
440 continue;
441 }
442
The Android Open Source Project455ed292009-03-13 13:04:22 -0700443 if (codepoints[codepoint_index] < 128) { // 1 << 7
444 new_len++;
445 } else if (codepoints[codepoint_index] < 2048) {
446 // 1 << (6 + 5)
447 new_len += 2;
448 } else if (codepoints[codepoint_index] < 65536) {
449 // 1 << (6 * 2 + 4)
450 new_len += 3;
451 } else if (codepoints[codepoint_index] < 2097152) {
452 // 1 << (6 * 3 + 3)
453 new_len += 4;
454 } else if (codepoints[codepoint_index] < 67108864) {
455 // 1 << (6 * 2 + 2)
456 new_len += 5;
457 } else {
458 new_len += 6;
459 }
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700460
461 codepoint_index++;
The Android Open Source Project455ed292009-03-13 13:04:22 -0700462 }
463 }
464
Daisuke Miyakawa0c45e822009-03-27 19:41:52 -0700465 if (codepoint_index == 0) {
466 // If all of codepoints are invalid, we place the string at the end of
467 // the list.
468 codepoints[0] = 0x10000 + CODEPOINT_FOR_NULL_STR;
469 codepoint_index = 1;
470 new_len = 4;
471 }
472
The Android Open Source Project455ed292009-03-13 13:04:22 -0700473 new_len += 1; // For '\0'.
474
475 *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
476 if (*dst == NULL) {
477 return false;
478 }
479
480 size_t ch_index;
481 {
482 size_t i;
483 for (i = 0, ch_index = 0; i < codepoint_index; i++) {
484 if (!GetUtf8FromCodePoint(codepoints[i], *dst,
485 new_len, &ch_index)) {
486 free(*dst);
487 *dst = NULL;
488 return false;
489 }
490 }
491 }
492
493 if (ch_index != new_len - 1) {
494 free(*dst);
495 *dst = NULL;
496 return false;
497 }
498
499 (*dst)[new_len - 1] = '\0';
500 *len = new_len;
501 return true;
502}
503
Daisuke Miyakawad28cdc42009-05-18 14:51:52 +0900504bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len) {
505 return GetExpectedString(src, dst, len, GetPhoneticallySortableCodePoint);
506}
507
508bool GetNormalizedString(const char *src, char **dst, size_t *len) {
509 return GetExpectedString(src, dst, len, GetNormalizedCodePoint);
510}
511
The Android Open Source Project455ed292009-03-13 13:04:22 -0700512} // namespace android