| /* |
| * Copyright (C) 2011 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| /* |
| * Validate and manipulate MUTF-8 (modified UTF-8) encoded string data. |
| */ |
| |
| #ifndef LIBDEX_DEXUTF_H_ |
| #define LIBDEX_DEXUTF_H_ |
| |
| #include "DexFile.h" |
| |
| /* |
| * Retrieve the next UTF-16 character from a UTF-8 string. |
| * |
| * Advances "*pUtf8Ptr" to the start of the next character. |
| * |
| * WARNING: If a string is corrupted by dropping a '\0' in the middle |
| * of a 3-byte sequence, you can end up overrunning the buffer with |
| * reads (and possibly with the writes if the length was computed and |
| * cached before the damage). For performance reasons, this function |
| * assumes that the string being parsed is known to be valid (e.g., by |
| * already being verified). Most strings we process here are coming |
| * out of dex files or other internal translations, so the only real |
| * risk comes from the JNI NewStringUTF call. |
| */ |
| DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr) |
| { |
| unsigned int one, two, three; |
| |
| one = *(*pUtf8Ptr)++; |
| if ((one & 0x80) != 0) { |
| /* two- or three-byte encoding */ |
| two = *(*pUtf8Ptr)++; |
| if ((one & 0x20) != 0) { |
| /* three-byte encoding */ |
| three = *(*pUtf8Ptr)++; |
| return ((one & 0x0f) << 12) | |
| ((two & 0x3f) << 6) | |
| (three & 0x3f); |
| } else { |
| /* two-byte encoding */ |
| return ((one & 0x1f) << 6) | |
| (two & 0x3f); |
| } |
| } else { |
| /* one-byte encoding */ |
| return one; |
| } |
| } |
| |
| /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode |
| * code point values for comparison. This treats different encodings |
| * for the same code point as equivalent, except that only a real '\0' |
| * byte is considered the string terminator. The return value is as |
| * for strcmp(). */ |
| int dexUtf8Cmp(const char* s1, const char* s2); |
| |
| /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */ |
| extern u4 DEX_MEMBER_VALID_LOW_ASCII[4]; |
| |
| /* Helper for dexIsValidMemberUtf8(); do not call directly. */ |
| bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr); |
| |
| /* Return whether the pointed-at modified-UTF-8 encoded character is |
| * valid as part of a member name, updating the pointer to point past |
| * the consumed character. This will consume two encoded UTF-16 code |
| * points if the character is encoded as a surrogate pair. Also, if |
| * this function returns false, then the given pointer may only have |
| * been partially advanced. */ |
| DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) { |
| u1 c = (u1) **pUtf8Ptr; |
| if (c <= 0x7f) { |
| // It's low-ascii, so check the table. |
| u4 wordIdx = c >> 5; |
| u4 bitIdx = c & 0x1f; |
| (*pUtf8Ptr)++; |
| return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0; |
| } |
| |
| /* |
| * It's a multibyte encoded character. Call a non-inline function |
| * for the heavy lifting. |
| */ |
| return dexIsValidMemberNameUtf8_0(pUtf8Ptr); |
| } |
| |
| /* Return whether the given string is a valid field or method name. */ |
| bool dexIsValidMemberName(const char* s); |
| |
| /* Return whether the given string is a valid type descriptor. */ |
| bool dexIsValidTypeDescriptor(const char* s); |
| |
| /* Return whether the given string is a valid internal-form class |
| * name, with components separated either by dots or slashes as |
| * specified. A class name is like a type descriptor, except that it |
| * can't name a primitive type (including void). In terms of syntax, |
| * the form is either (a) the name of the class without adornment |
| * (that is, not bracketed by "L" and ";"); or (b) identical to the |
| * type descriptor syntax for array types. */ |
| bool dexIsValidClassName(const char* s, bool dotSeparator); |
| |
| /* Return whether the given string is a valid reference descriptor. This |
| * is true if dexIsValidTypeDescriptor() returns true and the descriptor |
| * is for a class or array and not a primitive type. */ |
| bool dexIsReferenceDescriptor(const char* s); |
| |
| /* Return whether the given string is a valid class descriptor. This |
| * is true if dexIsValidTypeDescriptor() returns true and the descriptor |
| * is for a class and not an array or primitive type. */ |
| bool dexIsClassDescriptor(const char* s); |
| |
| /* Return whether the given string is a valid field type descriptor. This |
| * is true if dexIsValidTypeDescriptor() returns true and the descriptor |
| * is for anything but "void". */ |
| bool dexIsFieldDescriptor(const char* s); |
| |
| #endif // LIBDEX_DEXUTF_H_ |