/* blob: b1118a9843baef20dee1b53f05f6dc5e182b9676 */
/*
* Copyright (C) 2007 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <objmng/drm_i18n.h>
/*
* Byte-range predicates for double-byte charsets. Each macro expands its
* argument more than once, so pass an expression without side effects.
*/
/* GB2312: high byte in 0xA1..0xF7, low byte in 0xA1..0xFE. */
#define IS_GB2312_HIGH_BYTE(c) ((c) >= 0xA1 && (c) <= 0xF7)
#define IS_GB2312_LOW_BYTE(c) ((c) >= 0xA1 && (c) <= 0xFE)
/* GBK: high byte in 0x81..0xFE, low byte in 0x40..0xFE except 0x7F. */
#define IS_GBK_HIGH_BYTE(c) ((c) >= 0x81 && (c) <= 0xFE)
#define IS_GBK_LOW_BYTE(c) ((c) >= 0x40 && (c) <= 0xFE && (c) != 0x7F)
/* Big5: high byte in 0xA1..0xF9, low byte in 0x40..0x7E or 0xA1..0xFE. */
#define IS_BIG5_HIGH_BYTE(c) ((c) >= 0xA1 && (c) <= 0xF9)
#define IS_BIG5_LOW_BYTE(c) (((c) >= 0x40 && (c) <= 0x7E) \
|| ((c) >= 0xA1 && (c) <= 0xFE))
/* True for values up to 127. Only an upper bound is tested, so the argument
* should be unsigned; a negative value would also pass. */
#define IS_ASCII(c) ((c) <= 127)
/* Wide char stored in place of input that cannot be decoded (U+FFFD). */
#define INVALID_UNICODE 0xFFFD
/* Feature switches: these enable the #ifdef-guarded Latin-1 and
* UTF-8/UTF-16 code paths in this file. */
#define I18N_LATIN1_SUPPORT
#define I18N_UTF8_UTF16_SUPPORT
/*
* Forward declarations of the file-local converters.
*
* The four xxxToWcs decoders share one signature:
*   mbs, mbsLen        input bytes and their count
*   wcsBuf             output buffer of 16-bit chars; NULL asks only for the
*                      number of wide chars the input would need
*   bufSizeInWideChar  capacity of wcsBuf, counted in wide chars
*   bytesConsumed      optional (may be NULL); after a conversion into
*                      wcsBuf it holds the number of input bytes converted
*   return             number of wide chars produced
*
* The four wcToXxx encoders take one 16-bit char. The encoded bytes are
* stored only when mbs is non-NULL and bufSize is large enough; the return
* value is always the number of bytes the encoding takes.
*/
/**
* Convert ISO 8859-1 (latin1) to unicode: one wide char per input byte.
*/
static int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
uint16_t *wcsBuf, int32_t bufSizeInWideChar,
int32_t *bytesConsumed);
/**
* Convert one unicode char to one ISO 8859-1 (latin1) byte.
* Values of 0x100 and above become '?'.
*/
static int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize);
/**
* Convert UTF-8 to unicode. Bytes that cannot be decoded, and 4-byte
* sequences, are stored as INVALID_UNICODE.
*/
static int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
uint16_t *wcsBuf, int32_t bufSizeInWideChar,
int32_t *bytesConsumed);
/**
* Convert one unicode char to UTF-8 bytes (1, 2 or 3 bytes).
*/
static int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize);
/**
* Convert UTF-16 BE to unicode: each wide char is built from two input
* bytes, most significant byte first.
*/
static int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
uint16_t *wcsBuf, int32_t bufSizeInWideChar,
int32_t *bytesConsumed);
/**
* Convert one unicode char to UTF-16 BE bytes (always 2 bytes).
*/
static int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize);
/**
* Convert UTF-16 LE to unicode: each wide char is built from two input
* bytes, least significant byte first.
*/
static int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
uint16_t *wcsBuf, int32_t bufSizeInWideChar,
int32_t *bytesConsumed);
/**
* Convert one unicode char to UTF-16 LE bytes (always 2 bytes).
*/
static int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize);
/*
* see drm_i18n.h
*
* Dispatches on charset to the matching decoder and returns that decoder's
* result unchanged. A charset is handled only when its I18N_*_SUPPORT macro
* is defined at compile time; every other charset value returns -1.
*/
int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset,
const uint8_t *mbs, int32_t mbsLen,
uint16_t *wcsBuf, int32_t bufSizeInWideChar,
int32_t *bytesConsumed)
{
switch (charset)
{
#ifdef I18N_GB2312_SUPPORT
case DRM_CHARSET_GB2312:
return gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
#endif
#ifdef I18N_GBK_SUPPORT
case DRM_CHARSET_GBK:
return gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
#endif
#ifdef I18N_BIG5_SUPPORT
case DRM_CHARSET_BIG5:
return big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
#endif
#ifdef I18N_LATIN1_SUPPORT
case DRM_CHARSET_LATIN1:
return latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
#endif
#ifdef I18N_ISO8859X_SUPPORT
/* All ISO 8859-x variants share one decoder, which is told the charset. */
case DRM_CHARSET_LATIN2:
case DRM_CHARSET_LATIN3:
case DRM_CHARSET_LATIN4:
case DRM_CHARSET_CYRILLIC:
case DRM_CHARSET_ARABIC:
case DRM_CHARSET_GREEK:
case DRM_CHARSET_HEBREW:
case DRM_CHARSET_LATIN5:
case DRM_CHARSET_LATIN6:
case DRM_CHARSET_THAI:
case DRM_CHARSET_LATIN7:
case DRM_CHARSET_LATIN8:
case DRM_CHARSET_LATIN9:
case DRM_CHARSET_LATIN10:
return iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
#endif
#ifdef I18N_UTF8_UTF16_SUPPORT
case DRM_CHARSET_UTF8:
return utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
case DRM_CHARSET_UTF16BE:
return utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
case DRM_CHARSET_UTF16LE:
return utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
#endif
default:
/* Unknown charset, or one whose support was not compiled in. */
return -1;
}
}
/*
 * see drm_i18n.h
 *
 * Encodes wcsLen 16-bit chars from wcs into the requested charset.
 *
 * With a non-NULL mbsBuf the chars are encoded one after another until the
 * input is exhausted or the next char no longer fits; the return value is
 * the number of bytes stored. With a NULL mbsBuf nothing is stored and the
 * return value is the number of bytes the whole input would take.
 * A charset that is not compiled in returns -1.
 */
int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset,
        const uint16_t *wcs, int32_t wcsLen,
        uint8_t *mbsBuf, int32_t bufSizeInByte)
{
    /* Per-charset encoder for a single wide char. */
    int32_t (*encodeOne)(uint16_t, uint8_t *, int32_t);
    int32_t bytesOut = 0;
    int32_t pos;

    switch (charset)
    {
#ifdef I18N_LATIN1_SUPPORT
    case DRM_CHARSET_LATIN1:
        encodeOne = wcToLatin1;
        break;
#endif
#ifdef I18N_UTF8_UTF16_SUPPORT
    case DRM_CHARSET_UTF8:
        encodeOne = wcToUtf8;
        break;
    case DRM_CHARSET_UTF16BE:
        encodeOne = wcToUtf16be;
        break;
    case DRM_CHARSET_UTF16LE:
        encodeOne = wcToUtf16le;
        break;
#endif
#ifdef I18N_ISO8859X_SUPPORT
    /* The ISO 8859-x family converts the whole string in one call. */
    case DRM_CHARSET_LATIN2:
    case DRM_CHARSET_LATIN3:
    case DRM_CHARSET_LATIN4:
    case DRM_CHARSET_CYRILLIC:
    case DRM_CHARSET_ARABIC:
    case DRM_CHARSET_GREEK:
    case DRM_CHARSET_HEBREW:
    case DRM_CHARSET_LATIN5:
    case DRM_CHARSET_LATIN6:
    case DRM_CHARSET_THAI:
    case DRM_CHARSET_LATIN7:
    case DRM_CHARSET_LATIN8:
    case DRM_CHARSET_LATIN9:
    case DRM_CHARSET_LATIN10:
        return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte);
#endif
    default:
        return -1;
    }

    if (mbsBuf == NULL) {
        /* Measuring only: add up the encoded size of every char. */
        for (pos = 0; pos < wcsLen; pos++) {
            /* TODO: handle surrogate pair values here */
            bytesOut += encodeOne(wcs[pos], NULL, 0);
        }
        return bytesOut;
    }

    for (pos = 0; pos < wcsLen && bytesOut < bufSizeInByte; pos++) {
        /* TODO: handle surrogate pair values here */
        const int32_t room = bufSizeInByte - bytesOut;
        const int32_t need = encodeOne(wcs[pos], mbsBuf + bytesOut, room);

        if (need > room) {
            /* Insufficient buffer: the encoder stored nothing, so stop
             * without counting this char. */
            break;
        }
        bytesOut += need;
    }
    return bytesOut;
}
#ifdef I18N_LATIN1_SUPPORT
/**
 * Convert ISO 8859-1 (Latin-1) bytes to 16-bit wide chars, one wide char
 * per input byte.
 *
 * @param mbs               input bytes
 * @param mbsLen            number of input bytes
 * @param wcsBuf            output buffer, or NULL to only measure
 * @param bufSizeInWideChar capacity of wcsBuf in wide chars (ignored when
 *                          wcsBuf is NULL)
 * @param bytesConsumed     optional (may be NULL); always receives the
 *                          number of input bytes accounted for by the
 *                          return value
 * @return number of wide chars stored, or needed when wcsBuf is NULL;
 *         0 when either length is negative
 */
int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t count = mbsLen;
    int32_t k;

    /* The output capacity only matters when there is an output buffer. */
    if (wcsBuf != NULL && count > bufSizeInWideChar) {
        count = bufSizeInWideChar;
    }
    /* A negative length converts nothing. */
    if (count < 0) {
        count = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < count; k++) {
            wcsBuf[k] = mbs[k];
        }
    }

    /* Report on every path so the caller never reads a stale value. */
    if (bytesConsumed != NULL) {
        *bytesConsumed = count;
    }
    return count;
}
/**
 * Encode one 16-bit char as a single ISO 8859-1 (Latin-1) byte.
 * Values of 0x100 and above have no Latin-1 form and become '?'.
 *
 * @param wc      char to encode
 * @param mbs     destination, or NULL to only measure
 * @param bufSize bytes available at mbs; nothing is stored unless > 0
 * @return always 1, the encoded size
 */
int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs != NULL && bufSize > 0) {
        *mbs = (wc < 0x100) ? (uint8_t)wc : (uint8_t)'?';
    }
    return 1;
}
#endif /* I18N_LATIN1_SUPPORT */
#ifdef I18N_UTF8_UTF16_SUPPORT
/**
 * Decode UTF-8 into 16-bit wide chars.
 *
 * Each of the following produces exactly one INVALID_UNICODE (U+FFFD):
 *   - a stray continuation byte (0x80..0xBF);
 *   - a byte that can never start a sequence (0xF8..0xFF);
 *   - a lead byte whose continuation bytes are missing or wrong (the lead
 *     byte and the continuation bytes that do follow are skipped together);
 *   - a well-formed 4-byte sequence, because U+10000..U+10FFFF does not fit
 *     one 16-bit char and is not supported.
 *
 * A 2-, 3- or 4-byte sequence that is cut off by the end of the input is not
 * converted: decoding stops in front of it and *bytesConsumed points at its
 * lead byte, so the caller can retry once more bytes are available.
 *
 * NOTE(review): overlong forms (e.g. 0xC0 0x80) and 3-byte encoded
 * surrogates are decoded as-is rather than rejected - confirm whether any
 * caller depends on that before tightening it.
 *
 * @param mbs               input bytes
 * @param mbsLen            number of input bytes
 * @param wcsBuf            output buffer, or NULL to only count
 * @param bufSizeInWideChar capacity of wcsBuf in wide chars (ignored when
 *                          wcsBuf is NULL)
 * @param bytesConsumed     optional (may be NULL); receives the number of
 *                          input bytes decoded, also when only counting
 * @return number of wide chars stored, or counted when wcsBuf is NULL
 */
int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsConverted = 0;
    int32_t i = 0;

    if (wcsBuf == NULL) {
        /* Counting only. Every wide char uses up at least one input byte,
         * so mbsLen is an upper bound that never cuts the loop short (and,
         * unlike mbsLen * 2, cannot overflow). */
        bufSizeInWideChar = mbsLen;
    }

    while (i < mbsLen && charsConverted < bufSizeInWideChar) {
        const uint8_t ch = mbs[i];
        const int32_t seqStart = i;
        int32_t wideChar = -1;  /* -1: nothing decodable at this position */

        if (IS_ASCII(ch)) {
            wideChar = ch;
            i++;
        } else {
            int32_t expected;   /* continuation bytes announced by ch */
            int32_t found = 0;  /* continuation bytes actually present */

            if ((ch & 0xe0) == 0xc0) {
                expected = 1;
            } else if ((ch & 0xf0) == 0xe0) {
                expected = 2;
            } else if ((ch & 0xf8) == 0xf0) {
                expected = 3;
            } else {
                /* stray continuation byte or impossible lead byte */
                expected = 0;
            }

            i++;    /* step over ch itself */
            while (found < expected && i < mbsLen
                    && (mbs[i] & 0xc0) == 0x80) {
                i++;
                found++;
            }

            if (expected == 0) {
                /* A single invalid byte is consumed and replaced, even when
                 * it is the last byte: no further input could complete it. */
            } else if (found == expected) {
                if (expected == 1) {
                    wideChar = ((ch & 0x1f) << 6)
                            | (mbs[seqStart + 1] & 0x3f);
                } else if (expected == 2) {
                    wideChar = ((ch & 0x0f) << 12)
                            | ((mbs[seqStart + 1] & 0x3f) << 6)
                            | (mbs[seqStart + 2] & 0x3f);
                } else {
                    /* FIXME: we do NOT support U+10000 - U+10FFFF for now.
                     * leave it as 0xFFFD. */
                    wideChar = INVALID_UNICODE;
                }
            } else if (i >= mbsLen) {
                /* Possible incomplete UTF-8 sequence at the end of mbs.
                 * Leave it to the caller. */
                i = seqStart;
                break;
            }
            /* Otherwise the sequence is broken in mid-input: the skipped
             * bytes are replaced by one INVALID_UNICODE below. */
        }

        if (wcsBuf != NULL) {
            wcsBuf[charsConverted] =
                    (uint16_t)(wideChar == -1 ? INVALID_UNICODE : wideChar);
        }
        charsConverted++;
    }

    if (bytesConsumed != NULL) {
        *bytesConsumed = i;
    }
    return charsConverted;
}
/**
 * Encode one 16-bit char as UTF-8.
 *
 * @param wc      char to encode
 * @param mbs     destination, or NULL to only measure
 * @param bufSize bytes available at mbs; nothing is stored when the
 *                encoding does not fit
 * @return encoded size: 1 for wc <= 0x7f, 2 for wc <= 0x7ff, otherwise 3
 */
int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    const int32_t need = (wc <= 0x7f) ? 1 : (wc <= 0x7ff) ? 2 : 3;

    /* Measuring only, or not enough room: report the size, store nothing. */
    if (mbs == NULL || bufSize < need) {
        return need;
    }

    switch (need) {
    case 1:
        mbs[0] = (uint8_t)wc;
        break;
    case 2:
        mbs[0] = (uint8_t)((wc >> 6) | 0xc0);
        mbs[1] = (uint8_t)((wc & 0x3f) | 0x80);
        break;
    default:
        mbs[0] = (uint8_t)((wc >> 12) | 0xe0);
        mbs[1] = (uint8_t)(((wc >> 6) & 0x3f) | 0x80);
        mbs[2] = (uint8_t)((wc & 0x3f) | 0x80);
        break;
    }
    return need;
}
/**
 * Convert UTF-16 BE bytes to 16-bit wide chars: each wide char is built
 * from two input bytes, most significant byte first. An odd trailing byte
 * is left unconverted.
 *
 * @param mbs               input bytes
 * @param mbsLen            number of input bytes
 * @param wcsBuf            output buffer, or NULL to only measure
 * @param bufSizeInWideChar capacity of wcsBuf in wide chars (ignored when
 *                          wcsBuf is NULL)
 * @param bytesConsumed     optional (may be NULL); always receives twice
 *                          the return value
 * @return number of wide chars stored, or needed when wcsBuf is NULL;
 *         0 when either length is negative
 */
int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t units = mbsLen / 2;
    int32_t k;

    /* The output capacity only matters when there is an output buffer. */
    if (wcsBuf != NULL && units > bufSizeInWideChar) {
        units = bufSizeInWideChar;
    }
    /* A negative length must convert nothing; counting down from a negative
     * value would run far past both buffers. */
    if (units < 0) {
        units = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < units; k++) {
            /* TODO: handle surrogate pair values */
            wcsBuf[k] = (uint16_t)((mbs[2 * k] << 8) | mbs[2 * k + 1]);
        }
    }

    if (bytesConsumed != NULL) {
        *bytesConsumed = units * 2;
    }
    return units;
}
/**
 * Encode one 16-bit char as two UTF-16 BE bytes (high byte first).
 *
 * @param wc      char to encode
 * @param mbs     destination, or NULL to only measure
 * @param bufSize bytes available at mbs; nothing is stored when < 2
 * @return always 2, the encoded size
 */
int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    const int32_t need = 2;

    if (mbs == NULL || bufSize < need) {
        return need;
    }
    /* TODO: handle surrogate pair values */
    mbs[0] = (uint8_t)(wc >> 8);
    mbs[1] = (uint8_t)(wc & 0xff);
    return need;
}
/**
 * Convert UTF-16 LE bytes to 16-bit wide chars: each wide char is built
 * from two input bytes, least significant byte first. An odd trailing byte
 * is left unconverted.
 *
 * @param mbs               input bytes
 * @param mbsLen            number of input bytes
 * @param wcsBuf            output buffer, or NULL to only measure
 * @param bufSizeInWideChar capacity of wcsBuf in wide chars (ignored when
 *                          wcsBuf is NULL)
 * @param bytesConsumed     optional (may be NULL); always receives twice
 *                          the return value
 * @return number of wide chars stored, or needed when wcsBuf is NULL;
 *         0 when either length is negative
 */
int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t units = mbsLen / 2;
    int32_t k;

    /* The output capacity only matters when there is an output buffer. */
    if (wcsBuf != NULL && units > bufSizeInWideChar) {
        units = bufSizeInWideChar;
    }
    /* A negative length must convert nothing; counting down from a negative
     * value would run far past both buffers. */
    if (units < 0) {
        units = 0;
    }

    if (wcsBuf != NULL) {
        for (k = 0; k < units; k++) {
            /* TODO: handle surrogate pair values */
            wcsBuf[k] = (uint16_t)(mbs[2 * k] | (mbs[2 * k + 1] << 8));
        }
    }

    if (bytesConsumed != NULL) {
        *bytesConsumed = units * 2;
    }
    return units;
}
/**
 * Encode one 16-bit char as two UTF-16 LE bytes (low byte first).
 *
 * @param wc      char to encode
 * @param mbs     destination, or NULL to only measure
 * @param bufSize bytes available at mbs; nothing is stored when < 2
 * @return always 2, the encoded size
 */
int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    const int32_t need = 2;

    if (mbs == NULL || bufSize < need) {
        return need;
    }
    /* TODO: handle surrogate pair values */
    mbs[0] = (uint8_t)(wc & 0xff);
    mbs[1] = (uint8_t)(wc >> 8);
    return need;
}
#endif /* I18N_UTF8_UTF16_SUPPORT */