Blame - runtime/utf.cc - platform/art

blob: 93fcb321367c766a40ed2546261e85048c94fd52 [file] [log] [blame]

Elliott Hughes	2faa5f1	2012-01-30 14:42:07 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	16
				17	#include "utf.h"
				18
Andreas Gampe	5794381	2017-12-06 21:39:13 -0800	[diff] [blame]	19	#include <android-base/logging.h>
				20
Ian Rogers	2dd0e2c	2013-01-24 12:42:14 -0800	[diff] [blame]	21	#include "mirror/array.h"
Ian Rogers	4f6ad8a	2013-03-18 15:27:28 -0700	[diff] [blame]	22	#include "mirror/object-inl.h"
Ian Rogers	a672490	2013-09-23 09:23:37 -0700	[diff] [blame]	23	#include "utf-inl.h"
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	24
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	25	namespace art {
				26
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	27	// This is used only from debugger and test code.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	28	size_t CountModifiedUtf8Chars(const char* utf8) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	29	return CountModifiedUtf8Chars(utf8, strlen(utf8));
				30	}
				31
				32	/*
				33	* This does not validate UTF8 rules (nor did older code). But it gets the right answer
				34	* for valid UTF-8 and that's fine because it's used only to size a buffer for later
				35	* conversion.
				36	*
				37	* Modified UTF-8 consists of a series of bytes up to 21 bit Unicode code points as follows:
				38	* U+0001 - U+007F 0xxxxxxx
				39	* U+0080 - U+07FF 110xxxxx 10xxxxxx
				40	* U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
				41	* U+10000 - U+1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				42	*
				43	* U+0000 is encoded using the 2nd form to avoid nulls inside strings (this differs from
				44	* standard UTF-8).
				45	* The four byte encoding converts to two utf16 characters.
				46	*/
				47	size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count) {
				48	DCHECK_LE(byte_count, strlen(utf8));
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	49	size_t len = 0;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	50	const char* end = utf8 + byte_count;
				51	for (; utf8 < end; ++utf8) {
				52	int ic = *utf8;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	53	len++;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	54	if (LIKELY((ic & 0x80) == 0)) {
				55	// One-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	56	continue;
				57	}
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	58	// Two- or three-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	59	utf8++;
				60	if ((ic & 0x20) == 0) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	61	// Two-byte encoding.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	62	continue;
				63	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	64	utf8++;
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	65	if ((ic & 0x10) == 0) {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	66	// Three-byte encoding.
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	67	continue;
				68	}
				69
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	70	// Four-byte encoding: needs to be converted into a surrogate
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	71	// pair.
				72	utf8++;
				73	len++;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	74	}
				75	return len;
				76	}
				77
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	78	// This is used only from debugger and test code.
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	79	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
				80	while (*utf8_data_in != '\0') {
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	81	const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
				82	const uint16_t leading = GetLeadingUtf16Char(ch);
				83	const uint16_t trailing = GetTrailingUtf16Char(ch);
				84
				85	*utf16_data_out++ = leading;
				86	if (trailing != 0) {
				87	*utf16_data_out++ = trailing;
				88	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	89	}
				90	}
				91
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	92	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, size_t out_chars,
				93	const char* utf8_data_in, size_t in_bytes) {
				94	const char *in_start = utf8_data_in;
				95	const char *in_end = utf8_data_in + in_bytes;
				96	uint16_t *out_p = utf16_data_out;
				97
				98	if (LIKELY(out_chars == in_bytes)) {
				99	// Common case where all characters are ASCII.
				100	for (const char *p = in_start; p < in_end;) {
				101	// Safe even if char is signed because ASCII characters always have
				102	// the high bit cleared.
				103	out_p++ = dchecked_integral_cast<uint16_t>(p++);
				104	}
				105	return;
				106	}
				107
				108	// String contains non-ASCII characters.
				109	for (const char *p = in_start; p < in_end;) {
				110	const uint32_t ch = GetUtf16FromUtf8(&p);
				111	const uint16_t leading = GetLeadingUtf16Char(ch);
				112	const uint16_t trailing = GetTrailingUtf16Char(ch);
				113
				114	*out_p++ = leading;
				115	if (trailing != 0) {
				116	*out_p++ = trailing;
				117	}
				118	}
				119	}
				120
				121	void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
				122	const uint16_t* utf16_in, size_t char_count) {
				123	if (LIKELY(byte_count == char_count)) {
				124	// Common case where all characters are ASCII.
				125	const uint16_t *utf16_end = utf16_in + char_count;
				126	for (const uint16_t *p = utf16_in; p < utf16_end;) {
				127	utf8_out++ = dchecked_integral_cast<char>(p++);
				128	}
				129	return;
				130	}
				131
				132	// String contains non-ASCII characters.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	133	while (char_count--) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	134	const uint16_t ch = *utf16_in++;
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	135	if (ch > 0 && ch <= 0x7f) {
				136	*utf8_out++ = ch;
				137	} else {
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	138	// Char_count == 0 here implies we've encountered an unpaired
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	139	// surrogate and we have no choice but to encode it as 3-byte UTF
				140	// sequence. Note that unpaired surrogates can occur as a part of
				141	// "normal" operation.
				142	if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) {
				143	const uint16_t ch2 = *utf16_in;
				144
				145	// Check if the other half of the pair is within the expected
				146	// range. If it isn't, we will have to emit both "halves" as
				147	// separate 3 byte sequences.
				148	if (ch2 >= 0xdc00 && ch2 <= 0xdfff) {
				149	utf16_in++;
				150	char_count--;
				151	const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00;
				152	*utf8_out++ = (code_point >> 18) \| 0xf0;
				153	*utf8_out++ = ((code_point >> 12) & 0x3f) \| 0x80;
				154	*utf8_out++ = ((code_point >> 6) & 0x3f) \| 0x80;
				155	*utf8_out++ = (code_point & 0x3f) \| 0x80;
				156	continue;
				157	}
				158	}
				159
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	160	if (ch > 0x07ff) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	161	// Three byte encoding.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	162	*utf8_out++ = (ch >> 12) \| 0xe0;
				163	*utf8_out++ = ((ch >> 6) & 0x3f) \| 0x80;
				164	*utf8_out++ = (ch & 0x3f) \| 0x80;
				165	} else /(ch > 0x7f \|\| ch == 0)/ {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	166	// Two byte encoding.
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	167	*utf8_out++ = (ch >> 6) \| 0xc0;
				168	*utf8_out++ = (ch & 0x3f) \| 0x80;
				169	}
				170	}
				171	}
				172	}
				173
Vladimir Marko	cac5a7e	2016-02-22 10:39:50 +0000	[diff] [blame]	174	int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
				175	uint32_t hash = 0;
				176	while (utf16_length != 0u) {
				177	const uint32_t pair = GetUtf16FromUtf8(&utf8);
				178	const uint16_t first = GetLeadingUtf16Char(pair);
				179	hash = hash * 31 + first;
				180	--utf16_length;
				181	const uint16_t second = GetTrailingUtf16Char(pair);
				182	if (second != 0) {
				183	hash = hash * 31 + second;
				184	DCHECK_NE(utf16_length, 0u);
				185	--utf16_length;
				186	}
				187	}
				188	return static_cast<int32_t>(hash);
				189	}
				190
// Computes a 31-multiplier polynomial hash over the raw bytes of the
// NUL-terminated string `chars` (the terminator is not included). Bytes are
// folded in as `char` values, with no decoding.
uint32_t ComputeModifiedUtf8Hash(const char* chars) {
  uint32_t accumulator = 0;
  for (const char* cursor = chars; *cursor != '\0'; ++cursor) {
    accumulator = accumulator * 31 + *cursor;
  }
  return static_cast<int32_t>(accumulator);
}
				198
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	199	int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
				200	size_t utf16_length) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	201	for (;;) {
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	202	if (*utf8 == '\0') {
				203	return (utf16_length == 0) ? 0 : -1;
				204	} else if (utf16_length == 0) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	205	return 1;
				206	}
				207
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	208	const uint32_t pair = GetUtf16FromUtf8(&utf8);
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	209
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	210	// First compare the leading utf16 char.
				211	const uint16_t lhs = GetLeadingUtf16Char(pair);
				212	const uint16_t rhs = *utf16++;
				213	--utf16_length;
				214	if (lhs != rhs) {
				215	return lhs > rhs ? 1 : -1;
				216	}
				217
				218	// Then compare the trailing utf16 char. First check if there
				219	// are any characters left to consume.
				220	const uint16_t lhs2 = GetTrailingUtf16Char(pair);
				221	if (lhs2 != 0) {
				222	if (utf16_length == 0) {
				223	return 1;
				224	}
				225
				226	const uint16_t rhs2 = *utf16++;
				227	--utf16_length;
				228	if (lhs2 != rhs2) {
				229	return lhs2 > rhs2 ? 1 : -1;
				230	}
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	231	}
				232	}
				233	}
				234
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	235	size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
				236	size_t result = 0;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	237	const uint16_t *end = chars + char_count;
				238	while (chars < end) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	239	const uint16_t ch = *chars++;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	240	if (LIKELY(ch != 0 && ch < 0x80)) {
				241	result++;
				242	continue;
				243	}
				244	if (ch < 0x800) {
				245	result += 2;
				246	continue;
				247	}
				248	if (ch >= 0xd800 && ch < 0xdc00) {
				249	if (chars < end) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	250	const uint16_t ch2 = *chars;
				251	// If we find a properly paired surrogate, we emit it as a 4 byte
				252	// UTF sequence. If we find an unpaired leading or trailing surrogate,
				253	// we emit it as a 3 byte sequence like would have done earlier.
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	254	if (ch2 >= 0xdc00 && ch2 < 0xe000) {
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	255	chars++;
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	256	result += 4;
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	257	continue;
Narayan Kamath	e16dad1	2015-02-13 11:49:22 +0000	[diff] [blame]	258	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	259	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	260	}
Bruce Hoult	1646d7a	2015-10-28 15:06:12 +0300	[diff] [blame]	261	result += 3;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	262	}
				263	return result;
				264	}
				265
				266	} // namespace art