Blame - runtime/utf.cc - platform/art

blob: 7e064826356ec3f5f37d522335a4d0428a9ccec5 [file] [log] [blame]

Elliott Hughes	2faa5f1	2012-01-30 14:42:07 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	16
				17	#include "utf.h"
				18
Elliott Hughes	07ed66b	2012-12-12 18:34:25 -0800	[diff] [blame]	19	#include "base/logging.h"
Ian Rogers	2dd0e2c	2013-01-24 12:42:14 -0800	[diff] [blame]	20	#include "mirror/array.h"
Ian Rogers	4f6ad8a	2013-03-18 15:27:28 -0700	[diff] [blame]	21	#include "mirror/object-inl.h"
Ian Rogers	a672490	2013-09-23 09:23:37 -0700	[diff] [blame]	22	#include "utf-inl.h"
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	23
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	24	namespace art {
				25
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	26	// This is used only from debugger and test code.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	27	size_t CountModifiedUtf8Chars(const char* utf8) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	28	return CountModifiedUtf8Chars(utf8, strlen(utf8));
				29	}
				30
				31	/*
				32	* This does not validate UTF8 rules (nor did older code). But it gets the right answer
				33	* for valid UTF-8 and that's fine because it's used only to size a buffer for later
				34	* conversion.
				35	*
				36	* Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows:
				37	* U+0001 - U+007F 0xxxxxxx
				38	* U+0080 - U+07FF 110xxxxx 10xxxxxx
				39	* U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
				40	* U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				41	*
				42	* U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from
				43	* standard UTF-8).
				44	* The four byte encoding converts to two utf16 characters.
				45	*/
				46	size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
				47	DCHECK_LE(byte_count, strlen(utf8));
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	48	size_t len = 0;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	49	const char* end = utf8 + byte_count;
				50	for (; utf8 < end; ++utf8) {
				51	int ic = *utf8;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	52	len++;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	53	if (LIKELY((ic & 0x80) == 0)) {
				54	// One-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	55	continue;
				56	}
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	57	// Two- or three-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	58	utf8++;
				59	if ((ic & 0x20) == 0) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	60	// Two-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	61	continue;
				62	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	63	utf8++;
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	64	if ((ic & 0x10) == 0) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	65	// Three-byte encoding.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	66	continue;
				67	}
				68
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	69	// Four-byte encoding: needs to be converted into a surrogate
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	70	// pair.
				71	utf8++;
				72	len++;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	73	}
				74	return len;
				75	}
				76
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	77	// This is used only from debugger and test code.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	78	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
				79	while (*utf8_data_in != '\0') {
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	80	const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
				81	const uint16_t leading = GetLeadingUtf16Char(ch);
				82	const uint16_t trailing = GetTrailingUtf16Char(ch);
				83
				84	*utf16_data_out++ = leading;
				85	if (trailing != 0) {
				86	*utf16_data_out++ = trailing;
				87	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	88	}
				89	}
				90
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	91	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
				92	const char* utf8_data_in, size_t in_bytes) {
				93	const char *in_start = utf8_data_in;
				94	const char *in_end = utf8_data_in + in_bytes;
				95	uint16_t *out_p = utf16_data_out;
				96
				97	if (LIKELY(out_chars == in_bytes)) {
				98	// Common case where all characters are ASCII.
				99	for (const char *p = in_start; p < in_end;) {
				100	// Safe even if char is signed because ASCII characters always have
				101	// the high bit cleared.
				102	out_p++ = dchecked_integral_cast<uint16_t>(p++);
				103	}
				104	return;
				105	}
				106
				107	// String contains non-ASCII characters.
				108	for (const char *p = in_start; p < in_end;) {
				109	const uint32_t ch = GetUtf16FromUtf8(&p);
				110	const uint16_t leading = GetLeadingUtf16Char(ch);
				111	const uint16_t trailing = GetTrailingUtf16Char(ch);
				112
				113	*out_p++ = leading;
				114	if (trailing != 0) {
				115	*out_p++ = trailing;
				116	}
				117	}
				118	}
				119
				120	void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
				121	const uint16_t* utf16_in, size_t char_count) {
				122	if (LIKELY(byte_count == char_count)) {
				123	// Common case where all characters are ASCII.
				124	const uint16_t *utf16_end = utf16_in + char_count;
				125	for (const uint16_t *p = utf16_in; p < utf16_end;) {
				126	utf8_out++ = dchecked_integral_cast<char>(p++);
				127	}
				128	return;
				129	}
				130
				131	// String contains non-ASCII characters.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	132	while (char_count--) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	133	const uint16_t ch = *utf16_in++;
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	134	if (ch > 0 && ch <= 0x7f) {
				135	*utf8_out++ = ch;
				136	} else {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	137	// Char_count == 0 here implies we've encountered an unpaired
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	138	// surrogate and we have no choice but to encode it as 3-byte UTF
				139	// sequence. Note that unpaired surrogates can occur as a part of
				140	// "normal" operation.
				141	if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
				142	const uint16_t ch2 = *utf16_in;
				143
				144	// Check if the other half of the pair is within the expected
				145	// range. If it isn't, we will have to emit both "halves" as
				146	// separate 3 byte sequences.
				147	if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
				148	utf16_in++;
				149	char_count--;
				150	const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
				151	*utf8_out++ = (code_point >> 18) \| 0xf0;
				152	*utf8_out++ = ((code_point >> 12) & 0x3f) \| 0x80;
				153	*utf8_out++ = ((code_point >> 6) & 0x3f) \| 0x80;
				154	*utf8_out++ = (code_point & 0x3f) \| 0x80;
				155	continue;
				156	}
				157	}
				158
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	159	if (ch > 0x07ff) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	160	// Three byte encoding.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	161	*utf8_out++ = (ch >> 12) \| 0xe0;
				162	*utf8_out++ = ((ch >> 6) & 0x3f) \| 0x80;
				163	*utf8_out++ = (ch & 0x3f) \| 0x80;
				164	} else /(ch > 0x7f \|\| ch == 0)/ {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	165	// Two byte encoding.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	166	*utf8_out++ = (ch >> 6) \| 0xc0;
				167	*utf8_out++ = (ch & 0x3f) \| 0x80;
				168	}
				169	}
				170	}
				171	}
				172
Vladimir Marko	cac5a7e	2016-02-22 10:39:50 +0000	[diff] [blame]	173	int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
				174	uint32_t hash = 0;
				175	while (utf16_length != 0u) {
				176	const uint32_t pair = GetUtf16FromUtf8(&utf8);
				177	const uint16_t first = GetLeadingUtf16Char(pair);
				178	hash = hash * 31 + first;
				179	--utf16_length;
				180	const uint16_t second = GetTrailingUtf16Char(pair);
				181	if (second != 0) {
				182	hash = hash * 31 + second;
				183	DCHECK_NE(utf16_length, 0u);
				184	--utf16_length;
				185	}
				186	}
				187	return static_cast<int32_t>(hash);
				188	}
				189
// Hashes the raw bytes of the NUL-terminated string `chars` with the
// polynomial hash = hash * 31 + byte. The bytes are not decoded, and each one
// is added as a plain `char`, so bytes >= 0x80 contribute according to the
// platform's char signedness. Arithmetic wraps modulo 2^32.
uint32_t ComputeModifiedUtf8Hash(const char* chars) {
  uint32_t hash = 0;
  for (const char* cursor = chars; *cursor != '\0'; ++cursor) {
    hash = hash * 31 + *cursor;
  }
  return hash;
}
				197
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	198	int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
				199	size_t utf16_length) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	200	for (;;) {
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	201	if (*utf8 == '\0') {
				202	return (utf16_length == 0) ? 0 : -1;
				203	} else if (utf16_length == 0) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	204	return 1;
				205	}
				206
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	207	const uint32_t pair = GetUtf16FromUtf8(&utf8);
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	208
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	209	// First compare the leading utf16 char.
				210	const uint16_t lhs = GetLeadingUtf16Char(pair);
				211	const uint16_t rhs = *utf16++;
				212	--utf16_length;
				213	if (lhs != rhs) {
				214	return lhs > rhs ? 1 : -1;
				215	}
				216
				217	// Then compare the trailing utf16 char. First check if there
				218	// are any characters left to consume.
				219	const uint16_t lhs2 = GetTrailingUtf16Char(pair);
				220	if (lhs2 != 0) {
				221	if (utf16_length == 0) {
				222	return 1;
				223	}
				224
				225	const uint16_t rhs2 = *utf16++;
				226	--utf16_length;
				227	if (lhs2 != rhs2) {
				228	return lhs2 > rhs2 ? 1 : -1;
				229	}
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	230	}
				231	}
				232	}
				233
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	234	size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
				235	size_t result = 0;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	236	const uint16_t *end = chars + char_count;
				237	while (chars < end) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	238	const uint16_t ch = *chars++;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	239	if (LIKELY(ch != 0 && ch < 0x80)) {
				240	result++;
				241	continue;
				242	}
				243	if (ch < 0x800) {
				244	result += 2;
				245	continue;
				246	}
				247	if (ch >= 0xd800 && ch < 0xdc00) {
				248	if (chars < end) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	249	const uint16_t ch2 = *chars;
				250	// If we find a properly paired surrogate, we emit it as a 4 byte
				251	// UTF sequence. If we find an unpaired leading or trailing surrogate,
				252	// we emit it as a 3 byte sequence like would have done earlier.
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	253	if (ch2 >= 0xdc00 && ch2 < 0xe000) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	254	chars++;
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	255	result += 4;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	256	continue;
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	257	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	258	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	259	}
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	260	result += 3;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	261	}
				262	return result;
				263	}
				264
				265	} // namespace art