Dan Bornstein | 9ea32b0 | 2011-03-09 17:40:41 -0800 | [diff] [blame] | 1 | /* |
| 2 | * Copyright (C) 2011 The Android Open Source Project |
| 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | |
| 17 | /* |
| 18 | * Validate and manipulate MUTF-8 (modified UTF-8) encoded string data. |
| 19 | */ |
| 20 | |
Carl Shapiro | 375fb11 | 2011-06-14 20:31:24 -0700 | [diff] [blame] | 21 | #ifndef LIBDEX_DEXUTF_H_ |
| 22 | #define LIBDEX_DEXUTF_H_ |
Dan Bornstein | 9ea32b0 | 2011-03-09 17:40:41 -0800 | [diff] [blame] | 23 | |
| 24 | #include "DexFile.h" |
| 25 | |
| 26 | /* |
| 27 | * Retrieve the next UTF-16 character from a UTF-8 string. |
| 28 | * |
| 29 | * Advances "*pUtf8Ptr" to the start of the next character. |
| 30 | * |
| 31 | * WARNING: If a string is corrupted by dropping a '\0' in the middle |
| 32 | * of a 3-byte sequence, you can end up overrunning the buffer with |
| 33 | * reads (and possibly with the writes if the length was computed and |
| 34 | * cached before the damage). For performance reasons, this function |
| 35 | * assumes that the string being parsed is known to be valid (e.g., by |
| 36 | * already being verified). Most strings we process here are coming |
| 37 | * out of dex files or other internal translations, so the only real |
| 38 | * risk comes from the JNI NewStringUTF call. |
| 39 | */ |
| 40 | DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr) |
| 41 | { |
| 42 | unsigned int one, two, three; |
| 43 | |
| 44 | one = *(*pUtf8Ptr)++; |
| 45 | if ((one & 0x80) != 0) { |
| 46 | /* two- or three-byte encoding */ |
| 47 | two = *(*pUtf8Ptr)++; |
| 48 | if ((one & 0x20) != 0) { |
| 49 | /* three-byte encoding */ |
| 50 | three = *(*pUtf8Ptr)++; |
| 51 | return ((one & 0x0f) << 12) | |
| 52 | ((two & 0x3f) << 6) | |
| 53 | (three & 0x3f); |
| 54 | } else { |
| 55 | /* two-byte encoding */ |
| 56 | return ((one & 0x1f) << 6) | |
| 57 | (two & 0x3f); |
| 58 | } |
| 59 | } else { |
| 60 | /* one-byte encoding */ |
| 61 | return one; |
| 62 | } |
| 63 | } |
| 64 | |
| 65 | /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode |
| 66 | * code point values for comparison. This treats different encodings |
| 67 | * for the same code point as equivalent, except that only a real '\0' |
| 68 | * byte is considered the string terminator. The return value is as |
| 69 | * for strcmp(). */ |
| 70 | int dexUtf8Cmp(const char* s1, const char* s2); |
| 71 | |
| 72 | /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */ |
| 73 | extern u4 DEX_MEMBER_VALID_LOW_ASCII[4]; |
| 74 | |
| 75 | /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */ |
| 76 | bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr); |
| 77 | |
| 78 | /* Return whether the pointed-at modified-UTF-8 encoded character is |
| 79 | * valid as part of a member name, updating the pointer to point past |
| 80 | * the consumed character. This will consume two encoded UTF-16 code |
| 81 | * points if the character is encoded as a surrogate pair. Also, if |
| 82 | * this function returns false, then the given pointer may only have |
| 83 | * been partially advanced. */ |
| 84 | DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) { |
| 85 | u1 c = (u1) **pUtf8Ptr; |
| 86 | if (c <= 0x7f) { |
| 87 | // It's low-ascii, so check the table. |
| 88 | u4 wordIdx = c >> 5; |
| 89 | u4 bitIdx = c & 0x1f; |
| 90 | (*pUtf8Ptr)++; |
| 91 | return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0; |
| 92 | } |
| 93 | |
| 94 | /* |
| 95 | * It's a multibyte encoded character. Call a non-inline function |
| 96 | * for the heavy lifting. |
| 97 | */ |
| 98 | return dexIsValidMemberNameUtf8_0(pUtf8Ptr); |
| 99 | } |
| 100 | |
| 101 | /* Return whether the given string is a valid field or method name. */ |
| 102 | bool dexIsValidMemberName(const char* s); |
| 103 | |
| 104 | /* Return whether the given string is a valid type descriptor. */ |
| 105 | bool dexIsValidTypeDescriptor(const char* s); |
| 106 | |
| 107 | /* Return whether the given string is a valid internal-form class |
| 108 | * name, with components separated either by dots or slashes as |
| 109 | * specified. A class name is like a type descriptor, except that it |
| 110 | * can't name a primitive type (including void). In terms of syntax, |
| 111 | * the form is either (a) the name of the class without adornment |
| 112 | * (that is, not bracketed by "L" and ";"); or (b) identical to the |
| 113 | * type descriptor syntax for array types. */ |
| 114 | bool dexIsValidClassName(const char* s, bool dotSeparator); |
| 115 | |
| 116 | /* Return whether the given string is a valid reference descriptor. This |
| 117 | * is true if dexIsValidTypeDescriptor() returns true and the descriptor |
| 118 | * is for a class or array and not a primitive type. */ |
| 119 | bool dexIsReferenceDescriptor(const char* s); |
| 120 | |
| 121 | /* Return whether the given string is a valid class descriptor. This |
| 122 | * is true if dexIsValidTypeDescriptor() returns true and the descriptor |
| 123 | * is for a class and not an array or primitive type. */ |
| 124 | bool dexIsClassDescriptor(const char* s); |
| 125 | |
| 126 | /* Return whether the given string is a valid field type descriptor. This |
| 127 | * is true if dexIsValidTypeDescriptor() returns true and the descriptor |
| 128 | * is for anything but "void". */ |
| 129 | bool dexIsFieldDescriptor(const char* s); |
| 130 | |
Carl Shapiro | 375fb11 | 2011-06-14 20:31:24 -0700 | [diff] [blame] | 131 | #endif // LIBDEX_DEXUTF_H_ |