Blame - runtime/utf.cc - platform/art

blob: 3d13c3e492eafed173612b3add7812e1f0559c11 [file] [log] [blame]

Elliott Hughes	2faa5f1	2012-01-30 14:42:07 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	16
				17	#include "utf.h"
				18
Elliott Hughes	07ed66b	2012-12-12 18:34:25 -0800	[diff] [blame]	19	#include "base/logging.h"
Ian Rogers	2dd0e2c	2013-01-24 12:42:14 -0800	[diff] [blame]	20	#include "mirror/array.h"
Ian Rogers	4f6ad8a	2013-03-18 15:27:28 -0700	[diff] [blame]	21	#include "mirror/object-inl.h"
Ian Rogers	a672490	2013-09-23 09:23:37 -0700	[diff] [blame]	22	#include "utf-inl.h"
Elliott Hughes	b465ab0	2011-08-24 11:21:21 -0700	[diff] [blame]	23
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	24	namespace art {
				25
				26	size_t CountModifiedUtf8Chars(const char* utf8) {
				27	size_t len = 0;
				28	int ic;
				29	while ((ic = *utf8++) != '\0') {
				30	len++;
				31	if ((ic & 0x80) == 0) {
				32	// one-byte encoding
				33	continue;
				34	}
				35	// two- or three-byte encoding
				36	utf8++;
				37	if ((ic & 0x20) == 0) {
				38	// two-byte encoding
				39	continue;
				40	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	41	utf8++;
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	42	if ((ic & 0x10) == 0) {
				43	// three-byte encoding
				44	continue;
				45	}
				46
				47	// four-byte encoding: needs to be converted into a surrogate
				48	// pair.
				49	utf8++;
				50	len++;
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	51	}
				52	return len;
				53	}
				54
				55	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) {
				56	while (*utf8_data_in != '\0') {
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	57	const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in);
				58	const uint16_t leading = GetLeadingUtf16Char(ch);
				59	const uint16_t trailing = GetTrailingUtf16Char(ch);
				60
				61	*utf16_data_out++ = leading;
				62	if (trailing != 0) {
				63	*utf16_data_out++ = trailing;
				64	}
Elliott Hughes	814e403	2011-08-23 12:07:56 -0700	[diff] [blame]	65	}
				66	}
				67
// Encodes |char_count| UTF-16 code units from |utf16_in| into |utf8_out|.
//
// Encoding rules, per code unit:
//   - 0x0001..0x007f                 -> one byte
//   - 0x0000 and 0x0080..0x07ff      -> two bytes (so NUL becomes 0xc0 0x80)
//   - leading surrogate immediately
//     followed by a trailing one     -> four bytes for the combined code point
//   - everything else, including
//     unpaired surrogates            -> three bytes
//
// No terminating NUL is written and no output capacity is passed in;
// CountUtf8Bytes() applies the same size rules and can be used to size the
// buffer.
void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count) {
  char* dst = utf8_out;
  const uint16_t* src = utf16_in;
  const uint16_t* const end = utf16_in + char_count;

  while (src != end) {
    const uint16_t unit = *src++;

    if (unit >= 0x0001 && unit <= 0x007f) {
      // One byte encoding.
      *dst++ = unit;
      continue;
    }

    // A leading surrogate can only be combined when another unit follows it
    // and that unit is a trailing surrogate. Otherwise it is unpaired (which
    // can happen as part of "normal" operation) and falls through to the
    // three byte encoding below.
    const bool is_leading_surrogate = (unit >= 0xd800 && unit <= 0xdbff);
    if (is_leading_surrogate && src != end && *src >= 0xdc00 && *src <= 0xdfff) {
      const uint16_t trailing = *src++;
      // 0x035fdc00 == (0xd800 << 10) + 0xdc00 - 0x10000: removes both
      // surrogate bases and adds the supplementary plane offset in one step.
      const uint32_t code_point = (unit << 10) + trailing - 0x035fdc00;
      *dst++ = (code_point >> 18) | 0xf0;
      *dst++ = ((code_point >> 12) & 0x3f) | 0x80;
      *dst++ = ((code_point >> 6) & 0x3f) | 0x80;
      *dst++ = (code_point & 0x3f) | 0x80;
      continue;
    }

    if (unit <= 0x07ff) {
      // Two byte encoding; reached for 0x0080..0x07ff and for 0x0000.
      *dst++ = (unit >> 6) | 0xc0;
      *dst++ = (unit & 0x3f) | 0x80;
    } else {
      // Three byte encoding.
      *dst++ = (unit >> 12) | 0xe0;
      *dst++ = ((unit >> 6) & 0x3f) | 0x80;
      *dst++ = (unit & 0x3f) | 0x80;
    }
  }
}
				109
Ian Rogers	ef7d42f	2014-01-06 12:55:46 -0800	[diff] [blame]	110	int32_t ComputeUtf16Hash(mirror::CharArray* chars, int32_t offset,
Ian Rogers	0cfe1fb	2011-08-26 03:29:44 -0700	[diff] [blame]	111	size_t char_count) {
Ian Rogers	8f41dc3	2014-10-30 15:16:16 -0700	[diff] [blame]	112	uint32_t hash = 0;
Ian Rogers	0cfe1fb	2011-08-26 03:29:44 -0700	[diff] [blame]	113	for (size_t i = 0; i < char_count; i++) {
				114	hash = hash * 31 + chars->Get(offset + i);
				115	}
Ian Rogers	8f41dc3	2014-10-30 15:16:16 -0700	[diff] [blame]	116	return static_cast<int32_t>(hash);
Ian Rogers	0cfe1fb	2011-08-26 03:29:44 -0700	[diff] [blame]	117	}
				118
// Computes the multiply-by-31 hash over the |char_count| UTF-16 code units
// starting at |chars|.
//
// The hash is accumulated in an unsigned 32-bit value (so overflow wraps
// instead of being undefined) and is converted to int32_t only on return.
int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) {
  uint32_t accumulator = 0;
  const uint16_t* const end = chars + char_count;
  for (const uint16_t* cursor = chars; cursor != end; ++cursor) {
    accumulator = accumulator * 31 + *cursor;
  }
  return static_cast<int32_t>(accumulator);
}
				126
// Computes the multiply-by-31 hash over the bytes of the NUL-terminated
// string |chars| (the terminator is not hashed).
//
// NOTE(review): the hash is accumulated in a size_t, but the result is passed
// through int32_t before being returned as size_t. Where size_t is wider than
// 32 bits this keeps only the low 32 bits and then sign-extends them, so the
// upper bits of the result are all zeros or all ones. Each byte is added as a
// plain char, so bytes >= 0x80 contribute a negative value where char is
// signed. Both quirks are preserved here because changing them would change
// the hash values callers observe.
size_t ComputeModifiedUtf8Hash(const char* chars) {
  size_t accumulator = 0;
  for (const char* cursor = chars; *cursor != '\0'; ++cursor) {
    accumulator = accumulator * 31 + *cursor;
  }
  return static_cast<int32_t>(accumulator);
}
				134
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	135	int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
				136	size_t utf16_length) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	137	for (;;) {
Vladimir Marko	a48aef4	2014-12-03 17:53:53 +0000	[diff] [blame]	138	if (*utf8 == '\0') {
				139	return (utf16_length == 0) ? 0 : -1;
				140	} else if (utf16_length == 0) {
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	141	return 1;
				142	}
				143
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	144	const uint32_t pair = GetUtf16FromUtf8(&utf8);
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	145
Narayan Kamath	a5afcfc	2015-01-29 20:06:46 +0000	[diff] [blame]	146	// First compare the leading utf16 char.
				147	const uint16_t lhs = GetLeadingUtf16Char(pair);
				148	const uint16_t rhs = *utf16++;
				149	--utf16_length;
				150	if (lhs != rhs) {
				151	return lhs > rhs ? 1 : -1;
				152	}
				153
				154	// Then compare the trailing utf16 char. First check if there
				155	// are any characters left to consume.
				156	const uint16_t lhs2 = GetTrailingUtf16Char(pair);
				157	if (lhs2 != 0) {
				158	if (utf16_length == 0) {
				159	return 1;
				160	}
				161
				162	const uint16_t rhs2 = *utf16++;
				163	--utf16_length;
				164	if (lhs2 != rhs2) {
				165	return lhs2 > rhs2 ? 1 : -1;
				166	}
Ian Rogers	637c65b	2013-05-31 11:46:00 -0700	[diff] [blame]	167	}
				168	}
				169	}
				170
// Returns the number of bytes ConvertUtf16ToModifiedUtf8() emits for the
// |char_count| UTF-16 code units starting at |chars| (no terminator is
// counted).
//
// Per code unit:
//   - 0x0001..0x007f                 -> 1 byte
//   - 0x0000 and 0x0080..0x07ff      -> 2 bytes
//   - leading surrogate immediately
//     followed by a trailing one     -> 4 bytes for the pair
//   - everything else, including
//     unpaired surrogates            -> 3 bytes
size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) {
  size_t byte_total = 0;
  for (size_t i = 0; i < char_count; ++i) {
    const uint16_t unit = chars[i];

    if (unit != 0 && unit <= 0x7f) {
      byte_total += 1;
      continue;
    }
    if (unit <= 0x7ff) {
      // Also reached for 0x0000, which takes the two byte form.
      byte_total += 2;
      continue;
    }

    // Only a leading surrogate that is directly followed by a trailing
    // surrogate forms a pair. A leading surrogate at the very end of the
    // input, or one followed by anything else, is counted on its own.
    const bool is_leading_surrogate = (unit >= 0xd800 && unit <= 0xdbff);
    const bool has_next = (i + 1 < char_count);
    if (is_leading_surrogate && has_next &&
        chars[i + 1] >= 0xdc00 && chars[i + 1] <= 0xdfff) {
      ++i;  // The trailing half is covered by the four byte sequence.
      byte_total += 4;
    } else {
      byte_total += 3;
    }
  }
  return byte_total;
}
				204
				205	} // namespace art