Blame - libdex/DexUtf.h - platform/dalvik

blob: cb3d919ae12e5530771ba2db622585e2321c801f [file] [log] [blame]

Dan Bornstein	9ea32b0	2011-03-09 17:40:41 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 The Android Open Source Project
				3	*
				4	* Licensed under the Apache License, Version 2.0 (the "License");
				5	* you may not use this file except in compliance with the License.
				6	* You may obtain a copy of the License at
				7	*
				8	* http://www.apache.org/licenses/LICENSE-2.0
				9	*
				10	* Unless required by applicable law or agreed to in writing, software
				11	* distributed under the License is distributed on an "AS IS" BASIS,
				12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				13	* See the License for the specific language governing permissions and
				14	* limitations under the License.
				15	*/
				16
				17	/*
				18	* Validate and manipulate MUTF-8 (modified UTF-8) encoded string data.
				19	*/
				20
Carl Shapiro	375fb11	2011-06-14 20:31:24 -0700	[diff] [blame]	21	#ifndef LIBDEX_DEXUTF_H_
				22	#define LIBDEX_DEXUTF_H_
Dan Bornstein	9ea32b0	2011-03-09 17:40:41 -0800	[diff] [blame]	23
				24	#include "DexFile.h"
				25
				26	/*
				27	* Retrieve the next UTF-16 character from a UTF-8 string.
				28	*
				29	* Advances "*pUtf8Ptr" to the start of the next character.
				30	*
				31	* WARNING: If a string is corrupted by dropping a '\0' in the middle
				32	* of a 3-byte sequence, you can end up overrunning the buffer with
				33	* reads (and possibly with the writes if the length was computed and
				34	* cached before the damage). For performance reasons, this function
				35	* assumes that the string being parsed is known to be valid (e.g., by
				36	* already being verified). Most strings we process here are coming
				37	* out of dex files or other internal translations, so the only real
				38	* risk comes from the JNI NewStringUTF call.
				39	*/
				40	DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr)
				41	{
				42	unsigned int one, two, three;
				43
				44	one = (pUtf8Ptr)++;
				45	if ((one & 0x80) != 0) {
				46	/* two- or three-byte encoding */
				47	two = (pUtf8Ptr)++;
				48	if ((one & 0x20) != 0) {
				49	/* three-byte encoding */
				50	three = (pUtf8Ptr)++;
				51	return ((one & 0x0f) << 12) \|
				52	((two & 0x3f) << 6) \|
				53	(three & 0x3f);
				54	} else {
				55	/* two-byte encoding */
				56	return ((one & 0x1f) << 6) \|
				57	(two & 0x3f);
				58	}
				59	} else {
				60	/* one-byte encoding */
				61	return one;
				62	}
				63	}
				64
				65	/* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
				66	* code point values for comparison. This treats different encodings
				67	* for the same code point as equivalent, except that only a real '\0'
				68	* byte is considered the string terminator. The return value is as
				69	* for strcmp(). */
				70	int dexUtf8Cmp(const char* s1, const char* s2);
				71
				72	/* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
				73	extern u4 DEX_MEMBER_VALID_LOW_ASCII[4];
				74
				75	/* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
				76	bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr);
				77
				78	/* Return whether the pointed-at modified-UTF-8 encoded character is
				79	* valid as part of a member name, updating the pointer to point past
				80	* the consumed character. This will consume two encoded UTF-16 code
				81	* points if the character is encoded as a surrogate pair. Also, if
				82	* this function returns false, then the given pointer may only have
				83	* been partially advanced. */
				84	DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) {
				85	u1 c = (u1) **pUtf8Ptr;
				86	if (c <= 0x7f) {
				87	// It's low-ascii, so check the table.
				88	u4 wordIdx = c >> 5;
				89	u4 bitIdx = c & 0x1f;
				90	(*pUtf8Ptr)++;
				91	return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
				92	}
				93
				94	/*
				95	* It's a multibyte encoded character. Call a non-inline function
				96	* for the heavy lifting.
				97	*/
				98	return dexIsValidMemberNameUtf8_0(pUtf8Ptr);
				99	}
				100
				101	/* Return whether the given string is a valid field or method name. */
				102	bool dexIsValidMemberName(const char* s);
				103
				104	/* Return whether the given string is a valid type descriptor. */
				105	bool dexIsValidTypeDescriptor(const char* s);
				106
				107	/* Return whether the given string is a valid internal-form class
				108	* name, with components separated either by dots or slashes as
				109	* specified. A class name is like a type descriptor, except that it
				110	* can't name a primitive type (including void). In terms of syntax,
				111	* the form is either (a) the name of the class without adornment
				112	* (that is, not bracketed by "L" and ";"); or (b) identical to the
				113	* type descriptor syntax for array types. */
				114	bool dexIsValidClassName(const char* s, bool dotSeparator);
				115
				116	/* Return whether the given string is a valid reference descriptor. This
				117	* is true if dexIsValidTypeDescriptor() returns true and the descriptor
				118	* is for a class or array and not a primitive type. */
				119	bool dexIsReferenceDescriptor(const char* s);
				120
				121	/* Return whether the given string is a valid class descriptor. This
				122	* is true if dexIsValidTypeDescriptor() returns true and the descriptor
				123	* is for a class and not an array or primitive type. */
				124	bool dexIsClassDescriptor(const char* s);
				125
				126	/* Return whether the given string is a valid field type descriptor. This
				127	* is true if dexIsValidTypeDescriptor() returns true and the descriptor
				128	* is for anything but "void". */
				129	bool dexIsFieldDescriptor(const char* s);
				130
Carl Shapiro	375fb11	2011-06-14 20:31:24 -0700	[diff] [blame]	131	#endif // LIBDEX_DEXUTF_H_