J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | /* |
| 26 | ******************************************************************************* |
| 27 | * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * |
| 28 | * * |
| 29 | * The original version of this source code and documentation is copyrighted * |
| 30 | * and owned by IBM, These materials are provided under terms of a License * |
| 31 | * Agreement between IBM and Sun. This technology is protected by multiple * |
| 32 | * US and International patents. This notice and attribution to IBM may not * |
| 33 | * to removed. * |
| 34 | ******************************************************************************* |
| 35 | */ |
| 36 | |
| 37 | package sun.text.normalizer; |
| 38 | |
| 39 | // This class contains utility functions so testing not needed |
| 40 | ///CLOVER:OFF |
| 41 | public final class Utility { |
| 42 | |
| 43 | /** |
| 44 | * Convert characters outside the range U+0020 to U+007F to |
| 45 | * Unicode escapes, and convert backslash to a double backslash. |
| 46 | */ |
| 47 | public static final String escape(String s) { |
| 48 | StringBuffer buf = new StringBuffer(); |
| 49 | for (int i=0; i<s.length(); ) { |
| 50 | int c = UTF16.charAt(s, i); |
| 51 | i += UTF16.getCharCount(c); |
| 52 | if (c >= ' ' && c <= 0x007F) { |
| 53 | if (c == '\\') { |
| 54 | buf.append("\\\\"); // That is, "\\" |
| 55 | } else { |
| 56 | buf.append((char)c); |
| 57 | } |
| 58 | } else { |
| 59 | boolean four = c <= 0xFFFF; |
| 60 | buf.append(four ? "\\u" : "\\U"); |
| 61 | hex(c, four ? 4 : 8, buf); |
| 62 | } |
| 63 | } |
| 64 | return buf.toString(); |
| 65 | } |
| 66 | |
| 67 | /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ |
| 68 | static private final char[] UNESCAPE_MAP = { |
| 69 | /*" 0x22, 0x22 */ |
| 70 | /*' 0x27, 0x27 */ |
| 71 | /*? 0x3F, 0x3F */ |
| 72 | /*\ 0x5C, 0x5C */ |
| 73 | /*a*/ 0x61, 0x07, |
| 74 | /*b*/ 0x62, 0x08, |
| 75 | /*e*/ 0x65, 0x1b, |
| 76 | /*f*/ 0x66, 0x0c, |
| 77 | /*n*/ 0x6E, 0x0a, |
| 78 | /*r*/ 0x72, 0x0d, |
| 79 | /*t*/ 0x74, 0x09, |
| 80 | /*v*/ 0x76, 0x0b |
| 81 | }; |
| 82 | |
| 83 | /** |
| 84 | * Convert an escape to a 32-bit code point value. We attempt |
| 85 | * to parallel the icu4c unescapeAt() function. |
| 86 | * @param offset16 an array containing offset to the character |
| 87 | * <em>after</em> the backslash. Upon return offset16[0] will |
| 88 | * be updated to point after the escape sequence. |
| 89 | * @return character value from 0 to 10FFFF, or -1 on error. |
| 90 | */ |
| 91 | public static int unescapeAt(String s, int[] offset16) { |
| 92 | int c; |
| 93 | int result = 0; |
| 94 | int n = 0; |
| 95 | int minDig = 0; |
| 96 | int maxDig = 0; |
| 97 | int bitsPerDigit = 4; |
| 98 | int dig; |
| 99 | int i; |
| 100 | boolean braces = false; |
| 101 | |
| 102 | /* Check that offset is in range */ |
| 103 | int offset = offset16[0]; |
| 104 | int length = s.length(); |
| 105 | if (offset < 0 || offset >= length) { |
| 106 | return -1; |
| 107 | } |
| 108 | |
| 109 | /* Fetch first UChar after '\\' */ |
| 110 | c = UTF16.charAt(s, offset); |
| 111 | offset += UTF16.getCharCount(c); |
| 112 | |
| 113 | /* Convert hexadecimal and octal escapes */ |
| 114 | switch (c) { |
| 115 | case 'u': |
| 116 | minDig = maxDig = 4; |
| 117 | break; |
| 118 | case 'U': |
| 119 | minDig = maxDig = 8; |
| 120 | break; |
| 121 | case 'x': |
| 122 | minDig = 1; |
| 123 | if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) { |
| 124 | ++offset; |
| 125 | braces = true; |
| 126 | maxDig = 8; |
| 127 | } else { |
| 128 | maxDig = 2; |
| 129 | } |
| 130 | break; |
| 131 | default: |
| 132 | dig = UCharacter.digit(c, 8); |
| 133 | if (dig >= 0) { |
| 134 | minDig = 1; |
| 135 | maxDig = 3; |
| 136 | n = 1; /* Already have first octal digit */ |
| 137 | bitsPerDigit = 3; |
| 138 | result = dig; |
| 139 | } |
| 140 | break; |
| 141 | } |
| 142 | if (minDig != 0) { |
| 143 | while (offset < length && n < maxDig) { |
| 144 | c = UTF16.charAt(s, offset); |
| 145 | dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16); |
| 146 | if (dig < 0) { |
| 147 | break; |
| 148 | } |
| 149 | result = (result << bitsPerDigit) | dig; |
| 150 | offset += UTF16.getCharCount(c); |
| 151 | ++n; |
| 152 | } |
| 153 | if (n < minDig) { |
| 154 | return -1; |
| 155 | } |
| 156 | if (braces) { |
| 157 | if (c != 0x7D /*}*/) { |
| 158 | return -1; |
| 159 | } |
| 160 | ++offset; |
| 161 | } |
| 162 | if (result < 0 || result >= 0x110000) { |
| 163 | return -1; |
| 164 | } |
| 165 | // If an escape sequence specifies a lead surrogate, see |
| 166 | // if there is a trail surrogate after it, either as an |
| 167 | // escape or as a literal. If so, join them up into a |
| 168 | // supplementary. |
| 169 | if (offset < length && |
| 170 | UTF16.isLeadSurrogate((char) result)) { |
| 171 | int ahead = offset+1; |
| 172 | c = s.charAt(offset); // [sic] get 16-bit code unit |
| 173 | if (c == '\\' && ahead < length) { |
| 174 | int o[] = new int[] { ahead }; |
| 175 | c = unescapeAt(s, o); |
| 176 | ahead = o[0]; |
| 177 | } |
| 178 | if (UTF16.isTrailSurrogate((char) c)) { |
| 179 | offset = ahead; |
| 180 | result = UCharacterProperty.getRawSupplementary( |
| 181 | (char) result, (char) c); |
| 182 | } |
| 183 | } |
| 184 | offset16[0] = offset; |
| 185 | return result; |
| 186 | } |
| 187 | |
| 188 | /* Convert C-style escapes in table */ |
| 189 | for (i=0; i<UNESCAPE_MAP.length; i+=2) { |
| 190 | if (c == UNESCAPE_MAP[i]) { |
| 191 | offset16[0] = offset; |
| 192 | return UNESCAPE_MAP[i+1]; |
| 193 | } else if (c < UNESCAPE_MAP[i]) { |
| 194 | break; |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | /* Map \cX to control-X: X & 0x1F */ |
| 199 | if (c == 'c' && offset < length) { |
| 200 | c = UTF16.charAt(s, offset); |
| 201 | offset16[0] = offset + UTF16.getCharCount(c); |
| 202 | return 0x1F & c; |
| 203 | } |
| 204 | |
| 205 | /* If no special forms are recognized, then consider |
| 206 | * the backslash to generically escape the next character. */ |
| 207 | offset16[0] = offset; |
| 208 | return c; |
| 209 | } |
| 210 | |
| 211 | /** |
| 212 | * Convert a integer to size width hex uppercase digits. |
| 213 | * E.g., hex('a', 4, str) => "0041". |
| 214 | * Append the output to the given StringBuffer. |
| 215 | * If width is too small to fit, nothing will be appended to output. |
| 216 | */ |
| 217 | public static StringBuffer hex(int ch, int width, StringBuffer output) { |
| 218 | return appendNumber(output, ch, 16, width); |
| 219 | } |
| 220 | |
| 221 | /** |
| 222 | * Convert a integer to size width (minimum) hex uppercase digits. |
| 223 | * E.g., hex('a', 4, str) => "0041". If the integer requires more |
| 224 | * than width digits, more will be used. |
| 225 | */ |
| 226 | public static String hex(int ch, int width) { |
| 227 | StringBuffer buf = new StringBuffer(); |
| 228 | return appendNumber(buf, ch, 16, width).toString(); |
| 229 | } |
| 230 | |
| 231 | /** |
| 232 | * Skip over a sequence of zero or more white space characters |
| 233 | * at pos. Return the index of the first non-white-space character |
| 234 | * at or after pos, or str.length(), if there is none. |
| 235 | */ |
| 236 | public static int skipWhitespace(String str, int pos) { |
| 237 | while (pos < str.length()) { |
| 238 | int c = UTF16.charAt(str, pos); |
| 239 | if (!UCharacterProperty.isRuleWhiteSpace(c)) { |
| 240 | break; |
| 241 | } |
| 242 | pos += UTF16.getCharCount(c); |
| 243 | } |
| 244 | return pos; |
| 245 | } |
| 246 | |
/**
 * Digit characters indexed by digit value: '0'-'9' for values 0-9,
 * then 'A'-'Z' for values 10-35.
 */
static final char DIGITS[] = {
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
    'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
    'U', 'V', 'W', 'X', 'Y', 'Z'
};

/**
 * Append the digits of a non-negative integer to the given
 * <code>StringBuffer</code> in the given radix. This is
 * done recursively since it is easiest to generate the low-
 * order digit first, but it must be appended last.
 *
 * @param result is the <code>StringBuffer</code> to append to
 * @param n is the non-negative integer; a negative value would
 * produce a negative index into DIGITS
 * @param radix is the radix, from 2 to 36 inclusive
 * @param minDigits is the minimum number of digits to append;
 * leading zeros are added as needed.
 */
private static void recursiveAppendNumber(StringBuffer result, int n,
                                          int radix, int minDigits)
{
    int digit = n % radix;

    // Emit the higher-order digits (or padding zeros) first.
    if (n >= radix || minDigits > 1) {
        recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
    }

    result.append(DIGITS[digit]);
}

/**
 * Append a number to the given StringBuffer in the given radix.
 * Standard digits '0'-'9' are used and letters 'A'-'Z' for
 * radices 11 through 36.
 * @param result the digits of the number are appended here
 * @param n the number to be converted to digits; may be negative,
 * including <code>Integer.MIN_VALUE</code>.
 * If negative, a '-' is prepended to the digits.
 * @param radix a radix from 2 to 36 inclusive.
 * @param minDigits the minimum number of digits, not including
 * any '-', to produce. Values less than 2 have no effect. One
 * digit is always emitted regardless of this parameter.
 * @return a reference to result
 * @throws IllegalArgumentException if radix is outside 2..36
 */
public static StringBuffer appendNumber(StringBuffer result, int n,
                                        int radix, int minDigits)
    throws IllegalArgumentException
{
    if (radix < 2 || radix > 36) {
        throw new IllegalArgumentException("Illegal radix " + radix);
    }

    if (n >= 0) {
        recursiveAppendNumber(result, n, radix, minDigits);
        return result;
    }

    result.append("-");

    // Do not negate n itself: -Integer.MIN_VALUE overflows back to
    // Integer.MIN_VALUE. Split off the lowest digit while n is still
    // negative; both parts then negate safely because radix >= 2.
    int higherDigits = -(n / radix);
    int lowestDigit = -(n % radix);

    if (higherDigits > 0 || minDigits > 1) {
        recursiveAppendNumber(result, higherDigits, radix, minDigits - 1);
    }
    result.append(DIGITS[lowestDigit]);

    return result;
}
| 310 | |
| 311 | /** |
| 312 | * Return true if the character is NOT printable ASCII. The tab, |
| 313 | * newline and linefeed characters are considered unprintable. |
| 314 | */ |
| 315 | public static boolean isUnprintable(int c) { |
| 316 | return !(c >= 0x20 && c <= 0x7E); |
| 317 | } |
| 318 | |
| 319 | /** |
| 320 | * Escape unprintable characters using <backslash>uxxxx notation |
| 321 | * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and |
| 322 | * above. If the character is printable ASCII, then do nothing |
| 323 | * and return FALSE. Otherwise, append the escaped notation and |
| 324 | * return TRUE. |
| 325 | */ |
| 326 | public static boolean escapeUnprintable(StringBuffer result, int c) { |
| 327 | if (isUnprintable(c)) { |
| 328 | result.append('\\'); |
| 329 | if ((c & ~0xFFFF) != 0) { |
| 330 | result.append('U'); |
| 331 | result.append(DIGITS[0xF&(c>>28)]); |
| 332 | result.append(DIGITS[0xF&(c>>24)]); |
| 333 | result.append(DIGITS[0xF&(c>>20)]); |
| 334 | result.append(DIGITS[0xF&(c>>16)]); |
| 335 | } else { |
| 336 | result.append('u'); |
| 337 | } |
| 338 | result.append(DIGITS[0xF&(c>>12)]); |
| 339 | result.append(DIGITS[0xF&(c>>8)]); |
| 340 | result.append(DIGITS[0xF&(c>>4)]); |
| 341 | result.append(DIGITS[0xF&c]); |
| 342 | return true; |
| 343 | } |
| 344 | return false; |
| 345 | } |
| 346 | |
| 347 | //// for StringPrep |
| 348 | /** |
| 349 | * Similar to StringBuffer.getChars, version 1.3. |
| 350 | * Since JDK 1.2 implements StringBuffer.getChars differently, this method |
| 351 | * is here to provide consistent results. |
| 352 | * To be removed after JDK 1.2 ceased to be the reference platform. |
| 353 | * @param src source string buffer |
| 354 | * @param srcBegin offset to the start of the src to retrieve from |
| 355 | * @param srcEnd offset to the end of the src to retrieve from |
| 356 | * @param dst char array to store the retrieved chars |
| 357 | * @param dstBegin offset to the start of the destination char array to |
| 358 | * store the retrieved chars |
| 359 | * @draft since ICU4J 2.0 |
| 360 | */ |
| 361 | public static void getChars(StringBuffer src, int srcBegin, int srcEnd, |
| 362 | char dst[], int dstBegin) |
| 363 | { |
| 364 | if (srcBegin == srcEnd) { |
| 365 | return; |
| 366 | } |
| 367 | src.getChars(srcBegin, srcEnd, dst, dstBegin); |
| 368 | } |
| 369 | |
| 370 | /** |
| 371 | * Convenience utility to compare two char[]s. |
| 372 | * @param len the length to compare. |
| 373 | * The start indices and start+len must be valid. |
| 374 | */ |
| 375 | public final static boolean arrayRegionMatches(char[] source, int sourceStart, |
| 376 | char[] target, int targetStart, |
| 377 | int len) |
| 378 | { |
| 379 | int sourceEnd = sourceStart + len; |
| 380 | int delta = targetStart - sourceStart; |
| 381 | for (int i = sourceStart; i < sourceEnd; i++) { |
| 382 | if (source[i] != target[i + delta]) |
| 383 | return false; |
| 384 | } |
| 385 | return true; |
| 386 | } |
| 387 | |
| 388 | } |
| 389 | ///CLOVER:ON |