Blame - libutils/Unicode.cpp - platform/system/core

blob: 1ee1a0b57cc0af4b3551a40c82a2fe28c053c396 [file] [log] [blame]

Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2005 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#include <utils/Unicode.h>
				18
				19	#include <stddef.h>
				20
				21	#ifdef HAVE_WINSOCK
				22	# undef nhtol
				23	# undef htonl
				24	# undef nhtos
				25	# undef htons
				26
				27	# ifdef HAVE_LITTLE_ENDIAN
				28	# define ntohl(x) ( ((x) << 24) \| (((x) >> 24) & 255) \| (((x) << 8) & 0xff0000) \| (((x) >> 8) & 0xff00) )
				29	# define htonl(x) ntohl(x)
				30	# define ntohs(x) ( (((x) << 8) & 0xff00) \| (((x) >> 8) & 255) )
				31	# define htons(x) ntohs(x)
				32	# else
				33	# define ntohl(x) (x)
				34	# define htonl(x) (x)
				35	# define ntohs(x) (x)
				36	# define htons(x) (x)
				37	# endif
				38	#else
				39	# include <netinet/in.h>
				40	#endif
				41
				42	extern "C" {
				43
				44	static const char32_t kByteMask = 0x000000BF;
				45	static const char32_t kByteMark = 0x00000080;
				46
				47	// Surrogates aren't valid for UTF-32 characters, so define some
				48	// constants that will let us screen them out.
				49	static const char32_t kUnicodeSurrogateHighStart = 0x0000D800;
				50	static const char32_t kUnicodeSurrogateHighEnd = 0x0000DBFF;
				51	static const char32_t kUnicodeSurrogateLowStart = 0x0000DC00;
				52	static const char32_t kUnicodeSurrogateLowEnd = 0x0000DFFF;
				53	static const char32_t kUnicodeSurrogateStart = kUnicodeSurrogateHighStart;
				54	static const char32_t kUnicodeSurrogateEnd = kUnicodeSurrogateLowEnd;
				55	static const char32_t kUnicodeMaxCodepoint = 0x0010FFFF;
				56
				57	// Mask used to set appropriate bits in first byte of UTF-8 sequence,
				58	// indexed by number of bytes in the sequence.
				59	// 0xxxxxxx
				60	// -> (00-7f) 7bit. Bit mask for the first byte is 0x00000000
				61	// 110yyyyx 10xxxxxx
				62	// -> (c0-df)(80-bf) 11bit. Bit mask is 0x000000C0
				63	// 1110yyyy 10yxxxxx 10xxxxxx
				64	// -> (e0-ef)(80-bf)(80-bf) 16bit. Bit mask is 0x000000E0
				65	// 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx
				66	// -> (f0-f7)(80-bf)(80-bf)(80-bf) 21bit. Bit mask is 0x000000F0
				67	static const char32_t kFirstByteMark[] = {
				68	0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0
				69	};
				70
				71	// --------------------------------------------------------------------------
				72	// UTF-32
				73	// --------------------------------------------------------------------------
				74
				75	/**
				76	* Return number of UTF-8 bytes required for the character. If the character
				77	* is invalid, return size of 0.
				78	*/
				79	static inline size_t utf32_codepoint_utf8_length(char32_t srcChar)
				80	{
				81	// Figure out how many bytes the result will require.
				82	if (srcChar < 0x00000080) {
				83	return 1;
				84	} else if (srcChar < 0x00000800) {
				85	return 2;
				86	} else if (srcChar < 0x00010000) {
				87	if ((srcChar < kUnicodeSurrogateStart) \|\| (srcChar > kUnicodeSurrogateEnd)) {
				88	return 3;
				89	} else {
				90	// Surrogates are invalid UTF-32 characters.
				91	return 0;
				92	}
				93	}
				94	// Max code point for Unicode is 0x0010FFFF.
				95	else if (srcChar <= kUnicodeMaxCodepoint) {
				96	return 4;
				97	} else {
				98	// Invalid UTF-32 character.
				99	return 0;
				100	}
				101	}
				102
				103	// Write out the source character to <dstP>.
				104
				105	static inline void utf32_codepoint_to_utf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
				106	{
				107	dstP += bytes;
				108	switch (bytes)
				109	{ /* note: everything falls through. */
				110	case 4: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				111	case 3: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				112	case 2: *--dstP = (uint8_t)((srcChar \| kByteMark) & kByteMask); srcChar >>= 6;
				113	case 1: *--dstP = (uint8_t)(srcChar \| kFirstByteMark[bytes]);
				114	}
				115	}
				116
				117	size_t strlen32(const char32_t *s)
				118	{
				119	const char32_t *ss = s;
				120	while ( *ss )
				121	ss++;
				122	return ss-s;
				123	}
				124
				125	size_t strnlen32(const char32_t *s, size_t maxlen)
				126	{
				127	const char32_t *ss = s;
				128	while ((maxlen > 0) && *ss) {
				129	ss++;
				130	maxlen--;
				131	}
				132	return ss-s;
				133	}
				134
				135	static inline int32_t utf32_at_internal(const char* cur, size_t *num_read)
				136	{
				137	const char first_char = *cur;
				138	if ((first_char & 0x80) == 0) { // ASCII
				139	*num_read = 1;
				140	return *cur;
				141	}
				142	cur++;
				143	char32_t mask, to_ignore_mask;
				144	size_t num_to_read = 0;
				145	char32_t utf32 = first_char;
				146	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0xFFFFFF80;
				147	(first_char & mask);
				148	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				149	// 0x3F == 00111111
				150	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				151	}
				152	to_ignore_mask \|= mask;
				153	utf32 &= ~(to_ignore_mask << (6 * (num_to_read - 1)));
				154
				155	*num_read = num_to_read;
				156	return static_cast<int32_t>(utf32);
				157	}
				158
				159	int32_t utf32_from_utf8_at(const char src, size_t src_len, size_t index, size_t next_index)
				160	{
				161	if (index >= src_len) {
				162	return -1;
				163	}
				164	size_t dummy_index;
				165	if (next_index == NULL) {
				166	next_index = &dummy_index;
				167	}
				168	size_t num_read;
				169	int32_t ret = utf32_at_internal(src + index, &num_read);
				170	if (ret >= 0) {
				171	*next_index = index + num_read;
				172	}
				173
				174	return ret;
				175	}
				176
				177	ssize_t utf32_to_utf8_length(const char32_t *src, size_t src_len)
				178	{
				179	if (src == NULL \|\| src_len == 0) {
				180	return -1;
				181	}
				182
				183	size_t ret = 0;
				184	const char32_t *end = src + src_len;
				185	while (src < end) {
				186	ret += utf32_codepoint_utf8_length(*src++);
				187	}
				188	return ret;
				189	}
				190
				191	void utf32_to_utf8(const char32_t* src, size_t src_len, char* dst)
				192	{
				193	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				194	return;
				195	}
				196
				197	const char32_t *cur_utf32 = src;
				198	const char32_t *end_utf32 = src + src_len;
				199	char *cur = dst;
				200	while (cur_utf32 < end_utf32) {
				201	size_t len = utf32_codepoint_utf8_length(*cur_utf32);
				202	utf32_codepoint_to_utf8((uint8_t )cur, cur_utf32++, len);
				203	cur += len;
				204	}
				205	*cur = '\0';
				206	}
				207
				208	// --------------------------------------------------------------------------
				209	// UTF-16
				210	// --------------------------------------------------------------------------
				211
				212	int strcmp16(const char16_t s1, const char16_t s2)
				213	{
				214	char16_t ch;
				215	int d = 0;
				216
				217	while ( 1 ) {
				218	d = (int)(ch = s1++) - (int)s2++;
				219	if ( d \|\| !ch )
				220	break;
				221	}
				222
				223	return d;
				224	}
				225
				226	int strncmp16(const char16_t s1, const char16_t s2, size_t n)
				227	{
				228	char16_t ch;
				229	int d = 0;
				230
				231	while ( n-- ) {
				232	d = (int)(ch = s1++) - (int)s2++;
				233	if ( d \|\| !ch )
				234	break;
				235	}
				236
				237	return d;
				238	}
				239
				240	char16_t strcpy16(char16_t dst, const char16_t *src)
				241	{
				242	char16_t *q = dst;
				243	const char16_t *p = src;
				244	char16_t ch;
				245
				246	do {
				247	q++ = ch = p++;
				248	} while ( ch );
				249
				250	return dst;
				251	}
				252
				253	size_t strlen16(const char16_t *s)
				254	{
				255	const char16_t *ss = s;
				256	while ( *ss )
				257	ss++;
				258	return ss-s;
				259	}
				260
				261
				262	char16_t strncpy16(char16_t dst, const char16_t *src, size_t n)
				263	{
				264	char16_t *q = dst;
				265	const char16_t *p = src;
				266	char ch;
				267
				268	while (n) {
				269	n--;
				270	q++ = ch = p++;
				271	if ( !ch )
				272	break;
				273	}
				274
				275	*q = 0;
				276
				277	return dst;
				278	}
				279
				280	size_t strnlen16(const char16_t *s, size_t maxlen)
				281	{
				282	const char16_t *ss = s;
				283
				284	/* Important: the maxlen test must precede the reference through ss;
				285	since the byte beyond the maximum may segfault */
				286	while ((maxlen > 0) && *ss) {
				287	ss++;
				288	maxlen--;
				289	}
				290	return ss-s;
				291	}
				292
				293	int strzcmp16(const char16_t s1, size_t n1, const char16_t s2, size_t n2)
				294	{
				295	const char16_t* e1 = s1+n1;
				296	const char16_t* e2 = s2+n2;
				297
				298	while (s1 < e1 && s2 < e2) {
				299	const int d = (int)s1++ - (int)s2++;
				300	if (d) {
				301	return d;
				302	}
				303	}
				304
				305	return n1 < n2
				306	? (0 - (int)*s2)
				307	: (n1 > n2
				308	? ((int)*s1 - 0)
				309	: 0);
				310	}
				311
				312	int strzcmp16_h_n(const char16_t s1H, size_t n1, const char16_t s2N, size_t n2)
				313	{
				314	const char16_t* e1 = s1H+n1;
				315	const char16_t* e2 = s2N+n2;
				316
				317	while (s1H < e1 && s2N < e2) {
				318	const char16_t c2 = ntohs(*s2N);
				319	const int d = (int)*s1H++ - (int)c2;
				320	s2N++;
				321	if (d) {
				322	return d;
				323	}
				324	}
				325
				326	return n1 < n2
				327	? (0 - (int)ntohs(*s2N))
				328	: (n1 > n2
				329	? ((int)*s1H - 0)
				330	: 0);
				331	}
				332
				333	void utf16_to_utf8(const char16_t* src, size_t src_len, char* dst)
				334	{
				335	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				336	return;
				337	}
				338
				339	const char16_t* cur_utf16 = src;
				340	const char16_t* const end_utf16 = src + src_len;
				341	char *cur = dst;
				342	while (cur_utf16 < end_utf16) {
				343	char32_t utf32;
				344	// surrogate pairs
Cylen Yao	72299bf	2014-06-04 19:11:27 +0800	[diff] [blame^]	345	if((*cur_utf16 & 0xFC00) == 0xD800 && (cur_utf16 + 1) < end_utf16
				346	&& (*(cur_utf16 + 1) & 0xFC00) == 0xDC00) {
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	347	utf32 = (*cur_utf16++ - 0xD800) << 10;
				348	utf32 \|= *cur_utf16++ - 0xDC00;
				349	utf32 += 0x10000;
				350	} else {
				351	utf32 = (char32_t) *cur_utf16++;
				352	}
				353	const size_t len = utf32_codepoint_utf8_length(utf32);
				354	utf32_codepoint_to_utf8((uint8_t*)cur, utf32, len);
				355	cur += len;
				356	}
				357	*cur = '\0';
				358	}
				359
				360	// --------------------------------------------------------------------------
				361	// UTF-8
				362	// --------------------------------------------------------------------------
				363
				364	ssize_t utf8_length(const char *src)
				365	{
				366	const char *cur = src;
				367	size_t ret = 0;
				368	while (*cur != '\0') {
				369	const char first_char = *cur++;
				370	if ((first_char & 0x80) == 0) { // ASCII
				371	ret += 1;
				372	continue;
				373	}
				374	// (UTF-8's character must not be like 10xxxxxx,
				375	// but 110xxxxx, 1110xxxx, ... or 1111110x)
				376	if ((first_char & 0x40) == 0) {
				377	return -1;
				378	}
				379
				380	int32_t mask, to_ignore_mask;
				381	size_t num_to_read = 0;
				382	char32_t utf32 = 0;
				383	for (num_to_read = 1, mask = 0x40, to_ignore_mask = 0x80;
				384	num_to_read < 5 && (first_char & mask);
				385	num_to_read++, to_ignore_mask \|= mask, mask >>= 1) {
				386	if ((*cur & 0xC0) != 0x80) { // must be 10xxxxxx
				387	return -1;
				388	}
				389	// 0x3F == 00111111
				390	utf32 = (utf32 << 6) + (*cur++ & 0x3F);
				391	}
				392	// "first_char" must be (110xxxxx - 11110xxx)
				393	if (num_to_read == 5) {
				394	return -1;
				395	}
				396	to_ignore_mask \|= mask;
				397	utf32 \|= ((~to_ignore_mask) & first_char) << (6 * (num_to_read - 1));
				398	if (utf32 > kUnicodeMaxCodepoint) {
				399	return -1;
				400	}
				401
				402	ret += num_to_read;
				403	}
				404	return ret;
				405	}
				406
				407	ssize_t utf16_to_utf8_length(const char16_t *src, size_t src_len)
				408	{
				409	if (src == NULL \|\| src_len == 0) {
				410	return -1;
				411	}
				412
				413	size_t ret = 0;
				414	const char16_t* const end = src + src_len;
				415	while (src < end) {
				416	if ((*src & 0xFC00) == 0xD800 && (src + 1) < end
				417	&& (*++src & 0xFC00) == 0xDC00) {
				418	// surrogate pairs are always 4 bytes.
				419	ret += 4;
				420	src++;
				421	} else {
				422	ret += utf32_codepoint_utf8_length((char32_t) *src++);
				423	}
				424	}
				425	return ret;
				426	}
				427
				428	/**
				429	* Returns 1-4 based on the number of leading bits.
				430	*
				431	* 1111 -> 4
				432	* 1110 -> 3
				433	* 110x -> 2
				434	* 10xx -> 1
				435	* 0xxx -> 1
				436	*/
				437	static inline size_t utf8_codepoint_len(uint8_t ch)
				438	{
				439	return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1;
				440	}
				441
				442	static inline void utf8_shift_and_mask(uint32_t* codePoint, const uint8_t byte)
				443	{
				444	*codePoint <<= 6;
				445	*codePoint \|= 0x3F & byte;
				446	}
				447
				448	size_t utf8_to_utf32_length(const char *src, size_t src_len)
				449	{
				450	if (src == NULL \|\| src_len == 0) {
				451	return 0;
				452	}
				453	size_t ret = 0;
				454	const char* cur;
				455	const char* end;
				456	size_t num_to_skip;
				457	for (cur = src, end = src + src_len, num_to_skip = 1;
				458	cur < end;
				459	cur += num_to_skip, ret++) {
				460	const char first_char = *cur;
				461	num_to_skip = 1;
				462	if ((first_char & 0x80) == 0) { // ASCII
				463	continue;
				464	}
				465	int32_t mask;
				466
				467	for (mask = 0x40; (first_char & mask); num_to_skip++, mask >>= 1) {
				468	}
				469	}
				470	return ret;
				471	}
				472
				473	void utf8_to_utf32(const char* src, size_t src_len, char32_t* dst)
				474	{
				475	if (src == NULL \|\| src_len == 0 \|\| dst == NULL) {
				476	return;
				477	}
				478
				479	const char* cur = src;
				480	const char* const end = src + src_len;
				481	char32_t* cur_utf32 = dst;
				482	while (cur < end) {
				483	size_t num_read;
				484	*cur_utf32++ = static_cast<char32_t>(utf32_at_internal(cur, &num_read));
				485	cur += num_read;
				486	}
				487	*cur_utf32 = 0;
				488	}
				489
				490	static inline uint32_t utf8_to_utf32_codepoint(const uint8_t *src, size_t length)
				491	{
				492	uint32_t unicode;
				493
				494	switch (length)
				495	{
				496	case 1:
				497	return src[0];
				498	case 2:
				499	unicode = src[0] & 0x1f;
				500	utf8_shift_and_mask(&unicode, src[1]);
				501	return unicode;
				502	case 3:
				503	unicode = src[0] & 0x0f;
				504	utf8_shift_and_mask(&unicode, src[1]);
				505	utf8_shift_and_mask(&unicode, src[2]);
				506	return unicode;
				507	case 4:
				508	unicode = src[0] & 0x07;
				509	utf8_shift_and_mask(&unicode, src[1]);
				510	utf8_shift_and_mask(&unicode, src[2]);
				511	utf8_shift_and_mask(&unicode, src[3]);
				512	return unicode;
				513	default:
				514	return 0xffff;
				515	}
				516
				517	//printf("Char at %p: len=%d, utf-16=%p\n", src, length, (void*)result);
				518	}
				519
				520	ssize_t utf8_to_utf16_length(const uint8_t* u8str, size_t u8len)
				521	{
				522	const uint8_t* const u8end = u8str + u8len;
				523	const uint8_t* u8cur = u8str;
				524
				525	/* Validate that the UTF-8 is the correct len */
				526	size_t u16measuredLen = 0;
				527	while (u8cur < u8end) {
				528	u16measuredLen++;
				529	int u8charLen = utf8_codepoint_len(*u8cur);
				530	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8charLen);
				531	if (codepoint > 0xFFFF) u16measuredLen++; // this will be a surrogate pair in utf16
				532	u8cur += u8charLen;
				533	}
				534
				535	/**
				536	* Make sure that we ended where we thought we would and the output UTF-16
				537	* will be exactly how long we were told it would be.
				538	*/
				539	if (u8cur != u8end) {
				540	return -1;
				541	}
				542
				543	return u16measuredLen;
				544	}
				545
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	546	char16_t* utf8_to_utf16_no_null_terminator(const uint8_t* u8str, size_t u8len, char16_t* u16str)
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	547	{
				548	const uint8_t* const u8end = u8str + u8len;
				549	const uint8_t* u8cur = u8str;
				550	char16_t* u16cur = u16str;
				551
				552	while (u8cur < u8end) {
				553	size_t u8len = utf8_codepoint_len(*u8cur);
				554	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				555
				556	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				557	if (codepoint <= 0xFFFF) {
				558	// Single UTF16 character
				559	*u16cur++ = (char16_t) codepoint;
				560	} else {
				561	// Multiple UTF16 characters with surrogates
				562	codepoint = codepoint - 0x10000;
				563	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				564	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				565	}
				566
				567	u8cur += u8len;
				568	}
Jeff Brown	aa983c9	2011-10-07 13:28:18 -0700	[diff] [blame]	569	return u16cur;
				570	}
				571
				572	void utf8_to_utf16(const uint8_t* u8str, size_t u8len, char16_t* u16str) {
				573	char16_t* end = utf8_to_utf16_no_null_terminator(u8str, u8len, u16str);
				574	*end = 0;
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	575	}
				576
Dianne Hackborn	0f10d0a	2013-07-31 16:04:39 -0700	[diff] [blame]	577	char16_t* utf8_to_utf16_n(const uint8_t* src, size_t srcLen, char16_t* dst, size_t dstLen) {
				578	const uint8_t* const u8end = src + srcLen;
				579	const uint8_t* u8cur = src;
				580	const uint16_t* const u16end = dst + dstLen;
				581	char16_t* u16cur = dst;
				582
				583	while (u8cur < u8end && u16cur < u16end) {
				584	size_t u8len = utf8_codepoint_len(*u8cur);
				585	uint32_t codepoint = utf8_to_utf32_codepoint(u8cur, u8len);
				586
				587	// Convert the UTF32 codepoint to one or more UTF16 codepoints
				588	if (codepoint <= 0xFFFF) {
				589	// Single UTF16 character
				590	*u16cur++ = (char16_t) codepoint;
				591	} else {
				592	// Multiple UTF16 characters with surrogates
				593	codepoint = codepoint - 0x10000;
				594	*u16cur++ = (char16_t) ((codepoint >> 10) + 0xD800);
				595	if (u16cur >= u16end) {
				596	// Ooops... not enough room for this surrogate pair.
				597	return u16cur-1;
				598	}
				599	*u16cur++ = (char16_t) ((codepoint & 0x3FF) + 0xDC00);
				600	}
				601
				602	u8cur += u8len;
				603	}
				604	return u16cur;
				605	}
				606
Kenny Root	ba0165b	2010-11-09 14:37:23 -0800	[diff] [blame]	607	}