/*
 * Copyright (C) 2007 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17#include <objmng/drm_i18n.h>
18
/*
 * Byte-range predicates for the double-byte encodings named in each macro.
 * Every macro evaluates its argument more than once, so do not pass an
 * expression with side effects.
 */
#define IS_GB2312_HIGH_BYTE(c)  (0xA1 <= (c) && (c) <= 0xF7)
#define IS_GB2312_LOW_BYTE(c)   (0xA1 <= (c) && (c) <= 0xFE)
#define IS_GBK_HIGH_BYTE(c)     (0x81 <= (c) && (c) <= 0xFE)
/* 0x7F is excluded from the GBK trail-byte range. */
#define IS_GBK_LOW_BYTE(c)      (0x40 <= (c) && (c) <= 0xFE && 0x7F != (c))
#define IS_BIG5_HIGH_BYTE(c)    (0xA1 <= (c) && (c) <= 0xF9)
/* Big5 trail bytes come in two disjoint ranges. */
#define IS_BIG5_LOW_BYTE(c)     ((0x40 <= (c) && (c) <= 0x7E) \
                                 || (0xA1 <= (c) && (c) <= 0xFE))
/* True for values up to 0x7F (7-bit ASCII). */
#define IS_ASCII(c)             ((c) < 0x80)

/* Code unit stored in place of input that cannot be decoded. */
#define INVALID_UNICODE         0xFFFD

/* Compile-time switches selecting which converters this file provides. */
#define I18N_LATIN1_SUPPORT
#define I18N_UTF8_UTF16_SUPPORT
32
33
/*
 * Forward declarations of the per-charset converters defined later in this
 * file.
 *
 * The "...ToWcs" functions decode a byte string into 16-bit code units and
 * return the number of code units; the "wcTo..." functions encode a single
 * 16-bit code unit and return the number of bytes that encoding occupies.
 */

/** Decode ISO 8859-1 (Latin-1) bytes into 16-bit code units. */
static int32_t latin1ToWcs(const uint8_t *mbs,
                           int32_t mbsLen,
                           uint16_t *wcsBuf,
                           int32_t bufSizeInWideChar,
                           int32_t *bytesConsumed);

/** Encode one 16-bit code unit as a single ISO 8859-1 (Latin-1) byte. */
static int32_t wcToLatin1(uint16_t wc, uint8_t *mbs, int32_t bufSize);

/** Decode UTF-8 bytes into 16-bit code units. */
static int32_t utf8ToWcs(const uint8_t *mbs,
                         int32_t mbsLen,
                         uint16_t *wcsBuf,
                         int32_t bufSizeInWideChar,
                         int32_t *bytesConsumed);

/** Encode one 16-bit code unit as UTF-8 bytes. */
static int32_t wcToUtf8(uint16_t wc, uint8_t *mbs, int32_t bufSize);

/** Decode big-endian UTF-16 bytes into 16-bit code units. */
static int32_t utf16beToWcs(const uint8_t *mbs,
                            int32_t mbsLen,
                            uint16_t *wcsBuf,
                            int32_t bufSizeInWideChar,
                            int32_t *bytesConsumed);

/** Encode one 16-bit code unit as two big-endian UTF-16 bytes. */
static int32_t wcToUtf16be(uint16_t wc, uint8_t *mbs, int32_t bufSize);

/** Decode little-endian UTF-16 bytes into 16-bit code units. */
static int32_t utf16leToWcs(const uint8_t *mbs,
                            int32_t mbsLen,
                            uint16_t *wcsBuf,
                            int32_t bufSizeInWideChar,
                            int32_t *bytesConsumed);

/** Encode one 16-bit code unit as two little-endian UTF-16 bytes. */
static int32_t wcToUtf16le(uint16_t wc, uint8_t *mbs, int32_t bufSize);
81
82/*
83 * see drm_i18n.h
84 */
85int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset,
86 const uint8_t *mbs, int32_t mbsLen,
87 uint16_t *wcsBuf, int32_t bufSizeInWideChar,
88 int32_t *bytesConsumed)
89{
90 switch (charset)
91 {
92#ifdef I18N_GB2312_SUPPORT
93 case DRM_CHARSET_GB2312:
94 return gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
95#endif
96#ifdef I18N_GBK_SUPPORT
97 case DRM_CHARSET_GBK:
98 return gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
99#endif
100#ifdef I18N_BIG5_SUPPORT
101 case DRM_CHARSET_BIG5:
102 return big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
103#endif
104#ifdef I18N_LATIN1_SUPPORT
105 case DRM_CHARSET_LATIN1:
106 return latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
107#endif
108#ifdef I18N_ISO8859X_SUPPORT
109 case DRM_CHARSET_LATIN2:
110 case DRM_CHARSET_LATIN3:
111 case DRM_CHARSET_LATIN4:
112 case DRM_CHARSET_CYRILLIC:
113 case DRM_CHARSET_ARABIC:
114 case DRM_CHARSET_GREEK:
115 case DRM_CHARSET_HEBREW:
116 case DRM_CHARSET_LATIN5:
117 case DRM_CHARSET_LATIN6:
118 case DRM_CHARSET_THAI:
119 case DRM_CHARSET_LATIN7:
120 case DRM_CHARSET_LATIN8:
121 case DRM_CHARSET_LATIN9:
122 case DRM_CHARSET_LATIN10:
123 return iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
124#endif
125#ifdef I18N_UTF8_UTF16_SUPPORT
126 case DRM_CHARSET_UTF8:
127 return utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
128 case DRM_CHARSET_UTF16BE:
129 return utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
130 case DRM_CHARSET_UTF16LE:
131 return utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
132#endif
133 default:
134 return -1;
135 }
136}
137
138/*
139 * see drm_i18n.h
140 */
141int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset,
142 const uint16_t *wcs, int32_t wcsLen,
143 uint8_t *mbsBuf, int32_t bufSizeInByte)
144{
145 int32_t (* wcToMbFunc)(uint16_t, uint8_t *, int32_t);
146 int32_t charIndex = 0;
147 int32_t numMultiBytes = 0;
148
149 switch (charset)
150 {
151#ifdef I18N_LATIN1_SUPPORT
152 case DRM_CHARSET_LATIN1:
153 wcToMbFunc = wcToLatin1;
154 break;
155#endif
156#ifdef I18N_UTF8_UTF16_SUPPORT
157 case DRM_CHARSET_UTF8:
158 wcToMbFunc = wcToUtf8;
159 break;
160 case DRM_CHARSET_UTF16BE:
161 wcToMbFunc = wcToUtf16be;
162 break;
163 case DRM_CHARSET_UTF16LE:
164 wcToMbFunc = wcToUtf16le;
165 break;
166#endif
167#ifdef I18N_ISO8859X_SUPPORT
168 case DRM_CHARSET_LATIN2:
169 case DRM_CHARSET_LATIN3:
170 case DRM_CHARSET_LATIN4:
171 case DRM_CHARSET_CYRILLIC:
172 case DRM_CHARSET_ARABIC:
173 case DRM_CHARSET_GREEK:
174 case DRM_CHARSET_HEBREW:
175 case DRM_CHARSET_LATIN5:
176 case DRM_CHARSET_LATIN6:
177 case DRM_CHARSET_THAI:
178 case DRM_CHARSET_LATIN7:
179 case DRM_CHARSET_LATIN8:
180 case DRM_CHARSET_LATIN9:
181 case DRM_CHARSET_LATIN10:
182 return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte);
183#endif
184 default:
185 return -1;
186 }
187
188 if (mbsBuf) {
189 while (numMultiBytes < bufSizeInByte && charIndex < wcsLen) {
190 /* TODO: handle surrogate pair values here */
191 int32_t mbLen = wcToMbFunc(wcs[charIndex],
192 &mbsBuf[numMultiBytes], bufSizeInByte - numMultiBytes);
193
194 if (numMultiBytes + mbLen > bufSizeInByte) {
195 /* Insufficient buffer. Don't update numMultiBytes */
196 break;
197 }
198 charIndex++;
199 numMultiBytes += mbLen;
200 }
201 } else {
202 while (charIndex < wcsLen) {
203 /* TODO: handle surrogate pair values here */
204 numMultiBytes += wcToMbFunc(wcs[charIndex], NULL, 0);
205 charIndex++;
206 }
207 }
208
209 return numMultiBytes;
210}
211
212
213#ifdef I18N_LATIN1_SUPPORT
214
/**
 * Convert ISO 8859-1 (Latin-1) bytes to 16-bit code units. Each input byte
 * becomes the code unit with the same numeric value.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to only compute the
 *                          number of code units required.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units (ignored when
 *                          wcsBuf is NULL).
 * @param bytesConsumed     If non-NULL, always receives the number of input
 *                          bytes converted (or that would be converted
 *                          when wcsBuf is NULL).
 *
 * @return Number of code units written, or required when wcsBuf is NULL.
 *         Never negative: negative lengths are treated as empty.
 */
int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t count = mbsLen;
    int32_t k;

    /* One byte yields one code unit, so the output capacity is the only
     * other limit. */
    if (wcsBuf != NULL && count > bufSizeInWideChar) {
        count = bufSizeInWideChar;
    }
    if (count < 0) {
        count = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < count; k++) {
            wcsBuf[k] = mbs[k];
        }
    }

    /* Report on every path so the caller never reads an unset value. */
    if (bytesConsumed != NULL) {
        *bytesConsumed = count;
    }

    return count;
}
238
/**
 * Encode one 16-bit code unit as an ISO 8859-1 (Latin-1) byte.
 *
 * Values above 0xFF have no Latin-1 representation and are replaced by '?'.
 * The byte is stored only when mbs is non-NULL and bufSize is at least 1.
 *
 * @return Always 1, the size of the encoded character in bytes.
 */
int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs != NULL && bufSize >= 1) {
        *mbs = (wc > 0xFF) ? (uint8_t)'?' : (uint8_t)wc;
    }
    return 1;
}
252
253#endif /* I18N_LATIN1_SUPPORT */
254
255#ifdef I18N_UTF8_UTF16_SUPPORT
256
/**
 * Decode UTF-8 bytes into 16-bit code units.
 *
 * Decoding rules implemented here:
 *  - 1-, 2- and 3-byte sequences are decoded to their code unit.
 *  - A well-formed 4-byte sequence (U+10000 and above) is not supported
 *    and yields a single INVALID_UNICODE.
 *  - A malformed sequence yields one INVALID_UNICODE; the lead byte and
 *    any continuation bytes that followed it are skipped together.
 *  - A stray continuation byte (0x80..0xBF) or a byte that can never start
 *    a sequence (0xF8..0xFF) yields one INVALID_UNICODE.
 *  - A sequence that is cut off by the end of the input is NOT consumed,
 *    so the caller can retry once more bytes are available.
 *  - Overlong encodings and encoded surrogates are not rejected.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to only count the code
 *                          units the input would produce.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units (ignored when
 *                          wcsBuf is NULL).
 * @param bytesConsumed     If non-NULL, receives the number of input bytes
 *                          that were decoded.
 *
 * @return Number of code units produced (or counted when wcsBuf is NULL).
 */
int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsConverted = 0;
    int32_t i = 0;
    /* Every code unit consumes at least one input byte, so mbsLen is a
     * sufficient bound in counting mode and cannot overflow the way
     * mbsLen * 2 could. */
    int32_t limit = (wcsBuf != NULL) ? bufSizeInWideChar : mbsLen;

    while (i < mbsLen && charsConverted < limit) {
        uint8_t ch = mbs[i];
        int32_t wideChar = -1;      /* -1: nothing valid decoded */

        if (IS_ASCII(ch)) {
            wideChar = ch;
            i++;
        } else if ((ch & 0xc0) == 0x80 || (ch & 0xf8) == 0xf8) {
            /* Stray continuation byte, or a byte that can never begin a
             * sequence. More input cannot make it valid, so it is consumed
             * even when it is the last byte. */
            i++;
        } else {
            /* Lead byte 0xC0..0xF7: the high bits give the sequence length. */
            int32_t seqLen = ((ch & 0xe0) == 0xc0) ? 2
                           : ((ch & 0xf0) == 0xe0) ? 3
                           : 4;
            int32_t remaining = mbsLen - i;
            int32_t have = 1;       /* lead byte plus continuations seen */

            while (have < seqLen && have < remaining
                    && (mbs[i + have] & 0xc0) == 0x80) {
                have++;
            }

            if (have < seqLen) {
                if (have >= remaining) {
                    /* Possible incomplete UTF-8 sequence at the end of mbs.
                     * Leave it to the caller. */
                    break;
                }
                /* Malformed: skip the lead byte and its continuations. */
                i += have;
            } else {
                if (seqLen == 2) {
                    wideChar = (uint16_t)(((ch & 0x1F) << 6)
                            | (mbs[i + 1] & 0x3F));
                } else if (seqLen == 3) {
                    wideChar = (uint16_t)(((ch & 0x0F) << 12)
                            | ((mbs[i + 1] & 0x3F) << 6)
                            | (mbs[i + 2] & 0x3F));
                } else {
                    /* FIXME: we do NOT support U+10000 - U+10FFFF for now.
                     * leave it as 0xFFFD. */
                    wideChar = INVALID_UNICODE;
                }
                i += seqLen;
            }
        }

        if (wcsBuf != NULL) {
            wcsBuf[charsConverted] =
                (uint16_t)((wideChar == -1) ? INVALID_UNICODE : wideChar);
        }
        charsConverted++;
    }

    if (bytesConsumed != NULL) {
        *bytesConsumed = i;
    }

    return charsConverted;
}
351
/**
 * Encode one 16-bit code unit as UTF-8.
 *
 * The encoding takes 1 byte for values up to 0x7F, 2 bytes up to 0x7FF and
 * 3 bytes otherwise. The bytes are stored only when mbs is non-NULL and
 * bufSize can hold the whole encoding; nothing is written otherwise.
 *
 * @return Number of bytes the encoding occupies (1, 2 or 3), whether or
 *         not it was written.
 */
int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    uint8_t encoded[3];
    int32_t needed;
    int32_t k;

    if (wc < 0x80) {
        encoded[0] = (uint8_t)wc;
        needed = 1;
    } else if (wc < 0x800) {
        encoded[0] = (uint8_t)(0xc0 | (wc >> 6));
        encoded[1] = (uint8_t)(0x80 | (wc & 0x3f));
        needed = 2;
    } else {
        encoded[0] = (uint8_t)(0xe0 | (wc >> 12));
        encoded[1] = (uint8_t)(0x80 | ((wc >> 6) & 0x3f));
        encoded[2] = (uint8_t)(0x80 | (wc & 0x3f));
        needed = 3;
    }

    /* All-or-nothing: never emit a partial sequence. */
    if (mbs != NULL && bufSize >= needed) {
        for (k = 0; k < needed; k++) {
            mbs[k] = encoded[k];
        }
    }

    return needed;
}
374
/**
 * Convert big-endian UTF-16 bytes to 16-bit code units. Each pair of bytes
 * becomes one code unit; a trailing odd byte is not converted. Surrogate
 * pairs are copied through as two independent code units.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to only compute the
 *                          number of code units required.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units (ignored when
 *                          wcsBuf is NULL).
 * @param bytesConsumed     If non-NULL, receives the number of input bytes
 *                          converted (or that would be converted when
 *                          wcsBuf is NULL).
 *
 * @return Number of code units written, or required when wcsBuf is NULL.
 *         Never negative: negative lengths are treated as empty.
 */
int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t units = mbsLen / 2;
    int32_t k;

    if (wcsBuf != NULL && units > bufSizeInWideChar) {
        units = bufSizeInWideChar;
    }
    /* A negative length must not reach the copy loop: counting it down
     * would overrun both buffers. */
    if (units < 0) {
        units = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < units; k++) {
            /* TODO: handle surrogate pair values */
            wcsBuf[k] = (uint16_t)((mbs[2 * k] << 8) | mbs[2 * k + 1]);
        }
    }

    if (bytesConsumed != NULL) {
        *bytesConsumed = units * 2;
    }

    return units;
}
398
/**
 * Encode one 16-bit code unit as two bytes, most significant byte first.
 *
 * The bytes are stored only when mbs is non-NULL and bufSize is at least 2.
 *
 * @return Always 2, the size of the encoded code unit in bytes.
 */
int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs == NULL || bufSize < 2) {
        return 2;
    }

    /* TODO: handle surrogate pair values */
    mbs[0] = (uint8_t)(wc >> 8);
    mbs[1] = (uint8_t)(wc & 0xff);
    return 2;
}
408
/**
 * Convert little-endian UTF-16 bytes to 16-bit code units. Each pair of
 * bytes becomes one code unit; a trailing odd byte is not converted.
 * Surrogate pairs are copied through as two independent code units.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of bytes available in mbs.
 * @param wcsBuf            Output buffer, or NULL to only compute the
 *                          number of code units required.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units (ignored when
 *                          wcsBuf is NULL).
 * @param bytesConsumed     If non-NULL, receives the number of input bytes
 *                          converted (or that would be converted when
 *                          wcsBuf is NULL).
 *
 * @return Number of code units written, or required when wcsBuf is NULL.
 *         Never negative: negative lengths are treated as empty.
 */
int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t units = mbsLen / 2;
    int32_t k;

    if (wcsBuf != NULL && units > bufSizeInWideChar) {
        units = bufSizeInWideChar;
    }
    /* A negative length must not reach the copy loop: counting it down
     * would overrun both buffers. */
    if (units < 0) {
        units = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < units; k++) {
            /* TODO: handle surrogate pair values */
            wcsBuf[k] = (uint16_t)(mbs[2 * k] | (mbs[2 * k + 1] << 8));
        }
    }

    if (bytesConsumed != NULL) {
        *bytesConsumed = units * 2;
    }

    return units;
}
432
/**
 * Encode one 16-bit code unit as two bytes, least significant byte first.
 *
 * The bytes are stored only when mbs is non-NULL and bufSize is at least 2.
 *
 * @return Always 2, the size of the encoded code unit in bytes.
 */
int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs == NULL || bufSize < 2) {
        return 2;
    }

    /* TODO: handle surrogate pair values */
    mbs[0] = (uint8_t)(wc & 0xff);
    mbs[1] = (uint8_t)(wc >> 8);
    return 2;
}
442
443#endif /* I18N_UTF8_UTF16_SUPPORT */
444