Blame - current/sdk/common_os/include/art/libdexfile/dex/utf.h - platform/prebuilts/module_sdk/art

blob: d372bff662aec9ec2288807024b163ff0c75aaca [file] [log] [blame]

Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	#ifndef ART_LIBDEXFILE_DEX_UTF_H_
				18	#define ART_LIBDEXFILE_DEX_UTF_H_
				19
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	20	#include <stddef.h>
				21	#include <stdint.h>
				22
				23	#include <string>
Martin Stjernholm	413b2b5	2021-11-15 13:56:19 +0000	[diff] [blame]	24	#include <string_view>
satayev	499be97	2022-05-13 15:05:39 +0000	[diff] [blame]	25	#include <type_traits>
				26
				27	#include "base/macros.h"
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	28
				29	/*
				30	* All UTF-8 in art is actually modified UTF-8. Mostly, this distinction
				31	* doesn't matter.
				32	*
				33	* See http://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8 for the details.
				34	*/
				35	namespace art {
				36
				37	/*
				38	* Returns the number of UTF-16 characters in the given modified UTF-8 string.
				39	*/
				40	size_t CountModifiedUtf8Chars(const char* utf8);
				41	size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count);
				42
				43	/*
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	44	* Convert from Modified UTF-8 to UTF-16.
				45	*/
				46	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, const char* utf8_in);
				47	void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_out, size_t out_chars,
				48	const char* utf8_in, size_t in_bytes);
				49
				50	/*
				51	* Compare two modified UTF-8 strings as UTF-16 code point values in a non-locale sensitive manner.
				52	*/
				53	ALWAYS_INLINE int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1,
				54	const char* utf8_2);
				55
				56	/*
				57	* Compare a null-terminated modified UTF-8 string with a UTF-16 string (not null-terminated)
				58	* as code point values in a non-locale sensitive manner.
				59	*/
				60	int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t* utf16,
				61	size_t utf16_length);
				62
				63	/*
Martin Stjernholm	413b2b5	2021-11-15 13:56:19 +0000	[diff] [blame]	64	* Helper template for converting UTF-16 to UTF-8 and similar encodings.
				65	*
				66	* Template arguments:
				67	* kUseShortZero: Encode U+0000 as a single byte with value 0 (otherwise emit 0xc0 0x80).
				68	* kUse4ByteSequence: Encode valid surrogate pairs as a 4-byte sequence.
				69	* kReplaceBadSurrogates: Replace unmatched surrogates with '?' (otherwise use 3-byte sequence).
				70	* Must be false if kUse4ByteSequence is false.
				71	* Append: The type of the `append` functor. Should be deduced automatically.
				72	*
				73	* Encoding kUseShortZero kUse4ByteSequence kReplaceBadSurrogates
				74	* UTF-8 true true true
				75	* Modified UTF8 false false n/a
				76	* JNI GetStringUTFChars false true false
				77	*/
				78	template <bool kUseShortZero, bool kUse4ByteSequence, bool kReplaceBadSurrogates, typename Append>
				79	void ConvertUtf16ToUtf8(const uint16_t* utf16, size_t char_count, Append&& append);
				80
				81	/*
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	82	* Returns the number of modified UTF-8 bytes needed to represent the given
				83	* UTF-16 string.
				84	*/
				85	size_t CountModifiedUtf8BytesInUtf16(const uint16_t* chars, size_t char_count);
				86
				87	/*
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	88	* Convert from UTF-16 to Modified UTF-8. Note that the output is _not_
Fairphone ODM	25c12f5	2023-12-15 17:24:06 +0800	[diff] [blame]	89	* NUL-terminated. You probably need to call CountModifiedUtf8BytesInUtf16 before calling
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	90	* this anyway, so if you want a NUL-terminated string, you know where to
				91	* put the NUL byte.
				92	*/
				93	void ConvertUtf16ToModifiedUtf8(char* utf8_out, size_t byte_count,
				94	const uint16_t* utf16_in, size_t char_count);
				95
				96	/*
				97	* The java.lang.String hashCode() algorithm.
				98	*/
				99	template<typename MemoryType>
				100	int32_t ComputeUtf16Hash(const MemoryType* chars, size_t char_count) {
satayev	499be97	2022-05-13 15:05:39 +0000	[diff] [blame]	101	static_assert(std::is_same_v<MemoryType, char> \|\|
				102	std::is_same_v<MemoryType, uint8_t> \|\|
				103	std::is_same_v<MemoryType, uint16_t>);
				104	using UnsignedMemoryType = std::make_unsigned_t<MemoryType>;
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	105	uint32_t hash = 0;
				106	while (char_count--) {
satayev	499be97	2022-05-13 15:05:39 +0000	[diff] [blame]	107	hash = hash * 31 + static_cast<UnsignedMemoryType>(*chars++);
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	108	}
				109	return static_cast<int32_t>(hash);
				110	}
				111
				112	int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length);
				113
				114	// Compute a hash code of a modified UTF-8 string. Not the standard java hash since it returns a
				115	// uint32_t and hashes individual chars instead of codepoint words.
				116	uint32_t ComputeModifiedUtf8Hash(const char* chars);
Martin Stjernholm	413b2b5	2021-11-15 13:56:19 +0000	[diff] [blame]	117	uint32_t ComputeModifiedUtf8Hash(std::string_view chars);
				118
				119	// The starting value of a modified UTF-8 hash.
				120	constexpr uint32_t StartModifiedUtf8Hash() {
				121	return 0u;
				122	}
				123
				124	// Update a modified UTF-8 hash with one character.
				125	ALWAYS_INLINE
				126	inline uint32_t UpdateModifiedUtf8Hash(uint32_t hash, char c) {
				127	return hash * 31u + static_cast<uint8_t>(c);
				128	}
				129
				130	// Update a modified UTF-8 hash with characters of a `std::string_view`.
				131	ALWAYS_INLINE
				132	inline uint32_t UpdateModifiedUtf8Hash(uint32_t hash, std::string_view chars) {
				133	for (char c : chars) {
				134	hash = UpdateModifiedUtf8Hash(hash, c);
				135	}
				136	return hash;
				137	}
Martin Stjernholm	c15e7e4	2020-12-02 22:50:53 +0000	[diff] [blame]	138
				139	/*
				140	* Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string.
				141	* Single-byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16
				142	* character (possibly one half of a surrogate) whereas 4-byte UTF-8 sequences
				143	* result in a surrogate pair. Use GetLeadingUtf16Char and GetTrailingUtf16Char
				144	* to process the return value of this function.
				145	*
				146	* Advances "*utf8_data_in" to the start of the next character.
				147	*
				148	* WARNING: If a string is corrupted by dropping a '\0' in the middle
				149	* of a multi byte sequence, you can end up overrunning the buffer with
				150	* reads (and possibly with the writes if the length was computed and
				151	* cached before the damage). For performance reasons, this function
				152	* assumes that the string being parsed is known to be valid (e.g., by
				153	* already being verified). Most strings we process here are coming
				154	* out of dex files or other internal translations, so the only real
				155	* risk comes from the JNI NewStringUTF call.
				156	*/
				157	uint32_t GetUtf16FromUtf8(const char** utf8_data_in);
				158
				159	/**
				160	* Gets the leading UTF-16 character from a surrogate pair, or the sole
				161	* UTF-16 character from the return value of GetUtf16FromUtf8.
				162	*/
				163	ALWAYS_INLINE uint16_t GetLeadingUtf16Char(uint32_t maybe_pair);
				164
				165	/**
				166	* Gets the trailing UTF-16 character of a surrogate pair from the return
				167	* value of GetUtf16FromUtf8, or 0 if that value does not hold a pair.
				168	*/
				169	ALWAYS_INLINE uint16_t GetTrailingUtf16Char(uint32_t maybe_pair);
				170
				171	// Returns a printable (escaped) version of a character.
				172	std::string PrintableChar(uint16_t ch);
				173
				174	// Returns an ASCII string corresponding to the given UTF-8 string.
				175	// Java escapes are used for non-ASCII characters.
				176	std::string PrintableString(const char* utf8);
				177
				178	} // namespace art
				179
				180	#endif // ART_LIBDEXFILE_DEX_UTF_H_