blob: 7c8d1858db6a8edc1f34101eded431ca8664d13b [file] [log] [blame]
The Android Open Source Project455ed292009-03-13 13:04:22 -07001/*
2 * Copyright (C) 2009 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <stdio.h>
18#include <stdlib.h>
19
20#include "PhoneticStringUtils.h"
21
22// We'd like 0 length string last of sorted list. So when input string is NULL
23// or 0 length string, we use these instead.
24#define CODEPOINT_FOR_NULL_STR 0xFFFD
25#define STR_FOR_NULL_STR "\xEF\xBF\xBD"
26
27// We assume that users will not notice strings not sorted properly when the
28// first 128 characters are the same.
29#define MAX_CODEPOINTS 128
30
31namespace android {
32
33int GetCodePointFromUtf8(const char *src, size_t len, size_t index, int *next) {
34 if (src == NULL || len <= index) {
35 return -1;
36 }
37
38 if ((src[index] >> 7) == 0) {
39 if (next != NULL) {
40 *next = index + 1;
41 }
42 return src[index];
43 }
44 if ((src[index] & 64) == 0) {
45 return -1;
46 }
47 int mask;
48 size_t num_to_read;
49 for (num_to_read = 1, mask = 64; // 01000000
50 num_to_read < 7 && (src[index] & mask) == mask;
51 num_to_read++, mask >>= 1) {
52 }
53 if (num_to_read == 7) {
54 return -1;
55 }
56
57 if (num_to_read + index > len) {
58 return -1;
59 }
60
61 {
62 size_t i;
63 for (i = 0, mask = 0; i < (7 - num_to_read); i++) {
64 mask = (mask << 1) + 1;
65 }
66 }
67
68 int codepoint = mask & src[index];
69
70 for (size_t i = 1; i < num_to_read; i++) {
71 if ((src[i + index] & 192) != 128) { // must be 10xxxxxx
72 return -1;
73 }
74 codepoint = (codepoint << 6) + (src[i + index] & 63);
75 }
76
77 if (next != NULL) {
78 *next = index + num_to_read;
79 }
80
81 return codepoint;
82}
83
84int GetPhoneticallySortableCodePoint(int codepoint,
85 int next_codepoint,
86 bool *next_is_consumed) {
87 if (next_is_consumed != NULL) {
88 *next_is_consumed = false;
89 }
90
91 if (codepoint <= 0x0020 || codepoint == 0x3000) {
92 // Whitespace.
93 // Skip without increment of the variable "new_len".
94 // Note: Formally, more "whitespace" exist. This block only
95 // handles part of them
96 return 0x0020;
97 } else if ((0x0021 <= codepoint && codepoint <= 0x007E) ||
98 (0xFF01 <= codepoint && codepoint <= 0xFF5E)) {
99 // Ascii and fullwidth ascii
100
101 if (0x0021 <= codepoint && codepoint <= 0x007E) {
102 // Convert ascii to fullwidth ascii so that they become
103 // behind hiragana.
104 // 65248 = 0xFF01 - 0x0021
105 codepoint += 65248;
106 }
107
108 // Now, there is only fullwidth ascii.
109 if (0xFF10 <= codepoint && codepoint <= 0xFF19) {
110 // Numbers should be after alphabets but before symbols.
111 // 86 = 0xFF66
112 // (the beginning of halfwidth-katakankana space) - 0xFF10
113 return codepoint + 86;
114 } else if (0xFF41 <= codepoint && codepoint <= 0xFF5A) {
115 // Make lower alphabets same as capital alphabets.
116 // 32 = 0xFF41 - 0xFF21
117 return codepoint - 32;
118 } else if (0xFF01 <= codepoint && codepoint <= 0xFF0F) {
119 // Symbols (Ascii except alphabet nor number)
120 // These should be at the end of sorting, just after numebers
121 // (see below)
122 //
123 // We use halfwidth-katakana space for storing those symbols.
124 // 111 = 0xFF70 (0xFF19 + 86 + 1) - 0xFF01
125 return codepoint + 111;
126 } else if (0xFF1A <= codepoint && codepoint <= 0xFF20) {
127 // Symbols (cont.)
128 // 101 = 0xFF7F (0xFF0F + 111 + 1) - 0xFF1A
129 return codepoint + 101;
130 } else if (0xFF3B <= codepoint && codepoint <= 0xFF40) {
131 // Symbols (cont.)
132 // 75 = 0xFF86 (0xFF20 + 101 + 1) - 0xFF3B (= 101 - 26)
133 return codepoint + 75;
134 } else if (0xFF5B <= codepoint && codepoint <= 0xFF5E) {
135 // Symbols (cont.)
136 // 49 = 0xFF8C (0xFF40 + 75 + 1) - 0xFF5B (= 75 - 26)
137 return codepoint + 49;
138 } else {
139 return codepoint;
140 }
141 } else if (codepoint == 0x02DC || codepoint == 0x223C) {
142 // tilde
143 return 0xFF5E;
144 } else if (codepoint <= 0x3040 ||
145 (0x3100 <= codepoint && codepoint < 0xFF00) ||
146 codepoint == CODEPOINT_FOR_NULL_STR) {
147 // Move Kanji and other non-Japanese characters behind symbols.
148 return codepoint + 0x10000;
149 }
150
151 // Below is Kana-related handling.
152
153 // First, convert fullwidth katakana and halfwidth katakana to hiragana
154 if (0x30A1 <= codepoint && codepoint <= 0x30F6) {
155 // Make fullwidth katakana same as hiragana.
156 // 96 == 0x30A1 - 0x3041c
157 codepoint = codepoint - 96;
158 } else if (0xFF66 <= codepoint && codepoint <= 0xFF9F) {
159 // Make halfwidth katakana same as hiragana
160 switch (codepoint) {
161 case 0xFF66: // wo
162 codepoint = 0x3092;
163 break;
164 case 0xFF67: // xa
165 codepoint = 0x3041;
166 break;
167 case 0xFF68: // xi
168 codepoint = 0x3043;
169 break;
170 case 0xFF69: // xu
171 codepoint = 0x3045;
172 break;
173 case 0xFF6A: // xe
174 codepoint = 0x3047;
175 break;
176 case 0xFF6B: // xo
177 codepoint = 0x3049;
178 break;
179 case 0xFF6C: // xya
180 codepoint = 0x3083;
181 break;
182 case 0xFF6D: // xyu
183 codepoint = 0x3085;
184 break;
185 case 0xFF6E: // xyo
186 codepoint = 0x3087;
187 break;
188 case 0xFF6F: // xtsu
189 codepoint = 0x3063;
190 break;
191 case 0xFF70: // -
192 codepoint = 0x30FC;
193 break;
194 case 0xFF9C: // wa
195 codepoint = 0x308F;
196 break;
197 case 0xFF9D: // n
198 codepoint = 0x3093;
199 break;
200 default:
201 {
202 if (0xFF71 <= codepoint && codepoint <= 0xFF75) {
203 // a, i, u, e, o
204 if (codepoint == 0xFF73 && next_codepoint == 0xFF9E) {
205 if (next_is_consumed != NULL) {
206 *next_is_consumed = true;
207 }
208 codepoint = 0x3094; // vu
209 } else {
210 codepoint = 0x3042 + (codepoint - 0xFF71) * 2;
211 }
212 } else if (0xFF76 <= codepoint && codepoint <= 0xFF81) {
213 // ka - chi
214 if (next_codepoint == 0xFF9E) {
215 // "dakuten" (voiced mark)
216 if (next_is_consumed != NULL) {
217 *next_is_consumed = true;
218 }
219 codepoint = 0x304B + (codepoint - 0xFF76) * 2 + 1;
220 } else {
221 codepoint = 0x304B + (codepoint - 0xFF76) * 2;
222 }
223 } else if (0xFF82 <= codepoint && codepoint <= 0xFF84) {
224 // tsu, te, to (skip xtsu)
225 if (next_codepoint == 0xFF9E) {
226 // "dakuten" (voiced mark)
227 if (next_is_consumed != NULL) {
228 *next_is_consumed = true;
229 }
230 codepoint = 0x3064 + (codepoint - 0xFF82) * 2 + 1;
231 } else {
232 codepoint = 0x3064 + (codepoint - 0xFF82) * 2;
233 }
234 } else if (0xFF85 <= codepoint && codepoint <= 0xFF89) {
235 // na, ni, nu, ne, no
236 codepoint = 0x306A + (codepoint - 0xFF85);
237 } else if (0xFF8A <= codepoint && codepoint <= 0xFF8E) {
238 // ha, hi, hu, he, ho
239 if (next_codepoint == 0xFF9E) {
240 // "dakuten" (voiced mark)
241 if (next_is_consumed != NULL) {
242 *next_is_consumed = true;
243 }
244 codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 1;
245 } else if (next_codepoint == 0xFF9F) {
246 // "han-dakuten" (half voiced mark)
247 if (next_is_consumed != NULL) {
248 *next_is_consumed = true;
249 }
250 codepoint = 0x306F + (codepoint - 0xFF8A) * 3 + 2;
251 } else {
252 codepoint = 0x306F + (codepoint - 0xFF8A) * 3;
253 }
254 } else if (0xFF8F <= codepoint && codepoint <= 0xFF93) {
255 // ma, mi, mu, me, mo
256 codepoint = 0x307E + (codepoint - 0xFF8F);
257 } else if (0xFF94 <= codepoint && codepoint <= 0xFF96) {
258 // ya, yu, yo
259 codepoint = 0x3084 + (codepoint - 0xFF94) * 2;
260 } else if (0xFF97 <= codepoint && codepoint <= 0xFF9B) {
261 // ra, ri, ru, re, ro
262 codepoint = 0x3089 + (codepoint - 0xFF97);
263 }
264 // Note: 0xFF9C, 0xFF9D are handled above
265 } // end of default
266 } // end of case
267 }
268
269 // Trivial kana conversions.
270 // e.g. xa => a
271 switch (codepoint) {
272 case 0x3041:
273 case 0x3043:
274 case 0x3045:
275 case 0x3047:
276 case 0x3049:
277 case 0x308E: // xwa
278 codepoint++;
279 break;
280 case 0x3095: // xka
281 codepoint = 0x304B;
282 break;
283 case 0x3096: // xku
284 codepoint = 0x304F;
285 break;
286 }
287
288 return codepoint;
289}
290
291bool GetUtf8FromCodePoint(int codepoint, char *dst, size_t len, size_t *index) {
292 if (codepoint < 128) { // 1 << 7
293 if (*index >= len) {
294 return false;
295 }
296 // 0xxxxxxx
297 dst[*index] = static_cast<char>(codepoint);
298 (*index)++;
299 } else if (codepoint < 2048) { // 1 << (6 + 5)
300 if (*index + 1 >= len) {
301 return false;
302 }
303 // 110xxxxx
304 dst[(*index)++] = static_cast<char>(192 | (codepoint >> 6));
305 // 10xxxxxx
306 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
307 } else if (codepoint < 65536) { // 1 << (6 * 2 + 4)
308 if (*index + 2 >= len) {
309 return false;
310 }
311 // 1110xxxx
312 dst[(*index)++] = static_cast<char>(224 | (codepoint >> 12));
313 // 10xxxxxx
314 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
315 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
316 } else if (codepoint < 2097152) { // 1 << (6 * 3 + 3)
317 if (*index + 3 >= len) {
318 return false;
319 }
320 // 11110xxx
321 dst[(*index)++] = static_cast<char>(240 | (codepoint >> 18));
322 // 10xxxxxx
323 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
324 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
325 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
326 } else if (codepoint < 67108864) { // 1 << (6 * 2 + 2)
327 if (*index + 4 >= len) {
328 return false;
329 }
330 // 111110xx
331 dst[(*index)++] = static_cast<char>(248 | (codepoint >> 24));
332 // 10xxxxxx
333 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
334 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
335 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
336 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
337 } else {
338 if (*index + 5 >= len) {
339 return false;
340 }
341 // 1111110x
342 dst[(*index)++] = static_cast<char>(252 | (codepoint >> 30));
343 // 10xxxxxx
344 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 24) & 63));
345 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 18) & 63));
346 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 12) & 63));
347 dst[(*index)++] = static_cast<char>(128 | ((codepoint >> 6) & 63));
348 dst[(*index)++] = static_cast<char>(128 | (codepoint & 63));
349 }
350 return true;
351}
352
353bool GetPhoneticallySortableString(const char *src, char **dst, size_t *len){
354 if (dst == NULL || len == NULL) {
355 return false;
356 }
357
358 if (src == NULL || *src == '\0') {
359 src = STR_FOR_NULL_STR;
360 }
361
362 size_t src_len = strlen(src);
363 int codepoints[MAX_CODEPOINTS];
364 size_t new_len = 0;
365
366 size_t codepoint_index;
367 {
368 int i, next;
369 for (codepoint_index = 0, i = 0, next = 0;
370 static_cast<size_t>(i) < src_len &&
371 codepoint_index < MAX_CODEPOINTS;
372 i = next, codepoint_index++) {
373 int codepoint = GetCodePointFromUtf8(src, src_len, i, &next);
374 if (codepoint <= 0) {
375 return false;
376 }
377 int tmp_next;
378 int next_codepoint = GetCodePointFromUtf8(src, src_len,
379 next, &tmp_next);
380 bool next_is_consumed = false;
381
382 // It is ok even if next_codepoint is negative.
383 codepoints[codepoint_index] =
384 GetPhoneticallySortableCodePoint(codepoint,
385 next_codepoint,
386 &next_is_consumed);
387
388 // dakuten (voiced mark) or han-dakuten (half-voiced mark) existed.
389 if (next_is_consumed) {
390 next = tmp_next;
391 }
392
393 if (codepoints[codepoint_index] < 128) { // 1 << 7
394 new_len++;
395 } else if (codepoints[codepoint_index] < 2048) {
396 // 1 << (6 + 5)
397 new_len += 2;
398 } else if (codepoints[codepoint_index] < 65536) {
399 // 1 << (6 * 2 + 4)
400 new_len += 3;
401 } else if (codepoints[codepoint_index] < 2097152) {
402 // 1 << (6 * 3 + 3)
403 new_len += 4;
404 } else if (codepoints[codepoint_index] < 67108864) {
405 // 1 << (6 * 2 + 2)
406 new_len += 5;
407 } else {
408 new_len += 6;
409 }
410 }
411 }
412
413 new_len += 1; // For '\0'.
414
415 *dst = static_cast<char *>(malloc(sizeof(char) * new_len));
416 if (*dst == NULL) {
417 return false;
418 }
419
420 size_t ch_index;
421 {
422 size_t i;
423 for (i = 0, ch_index = 0; i < codepoint_index; i++) {
424 if (!GetUtf8FromCodePoint(codepoints[i], *dst,
425 new_len, &ch_index)) {
426 free(*dst);
427 *dst = NULL;
428 return false;
429 }
430 }
431 }
432
433 if (ch_index != new_len - 1) {
434 free(*dst);
435 *dst = NULL;
436 return false;
437 }
438
439 (*dst)[new_len - 1] = '\0';
440 *len = new_len;
441 return true;
442}
443
444} // namespace android