/*
2 * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25/*
26 *******************************************************************************
27 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
28 * *
29 * The original version of this source code and documentation is copyrighted *
30 * and owned by IBM, These materials are provided under terms of a License *
31 * Agreement between IBM and Sun. This technology is protected by multiple *
32 * US and International patents. This notice and attribution to IBM may not *
33 * to removed. *
34 *******************************************************************************
35 */
36
37package sun.text.normalizer;
38
39// This class contains utility functions so testing not needed
40///CLOVER:OFF
41public final class Utility {
42
43 /**
44 * Convert characters outside the range U+0020 to U+007F to
45 * Unicode escapes, and convert backslash to a double backslash.
46 */
47 public static final String escape(String s) {
48 StringBuffer buf = new StringBuffer();
49 for (int i=0; i<s.length(); ) {
50 int c = UTF16.charAt(s, i);
51 i += UTF16.getCharCount(c);
52 if (c >= ' ' && c <= 0x007F) {
53 if (c == '\\') {
54 buf.append("\\\\"); // That is, "\\"
55 } else {
56 buf.append((char)c);
57 }
58 } else {
59 boolean four = c <= 0xFFFF;
60 buf.append(four ? "\\u" : "\\U");
61 hex(c, four ? 4 : 8, buf);
62 }
63 }
64 return buf.toString();
65 }
66
67 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
68 static private final char[] UNESCAPE_MAP = {
69 /*" 0x22, 0x22 */
70 /*' 0x27, 0x27 */
71 /*? 0x3F, 0x3F */
72 /*\ 0x5C, 0x5C */
73 /*a*/ 0x61, 0x07,
74 /*b*/ 0x62, 0x08,
75 /*e*/ 0x65, 0x1b,
76 /*f*/ 0x66, 0x0c,
77 /*n*/ 0x6E, 0x0a,
78 /*r*/ 0x72, 0x0d,
79 /*t*/ 0x74, 0x09,
80 /*v*/ 0x76, 0x0b
81 };
82
83 /**
84 * Convert an escape to a 32-bit code point value. We attempt
85 * to parallel the icu4c unescapeAt() function.
86 * @param offset16 an array containing offset to the character
87 * <em>after</em> the backslash. Upon return offset16[0] will
88 * be updated to point after the escape sequence.
89 * @return character value from 0 to 10FFFF, or -1 on error.
90 */
91 public static int unescapeAt(String s, int[] offset16) {
92 int c;
93 int result = 0;
94 int n = 0;
95 int minDig = 0;
96 int maxDig = 0;
97 int bitsPerDigit = 4;
98 int dig;
99 int i;
100 boolean braces = false;
101
102 /* Check that offset is in range */
103 int offset = offset16[0];
104 int length = s.length();
105 if (offset < 0 || offset >= length) {
106 return -1;
107 }
108
109 /* Fetch first UChar after '\\' */
110 c = UTF16.charAt(s, offset);
111 offset += UTF16.getCharCount(c);
112
113 /* Convert hexadecimal and octal escapes */
114 switch (c) {
115 case 'u':
116 minDig = maxDig = 4;
117 break;
118 case 'U':
119 minDig = maxDig = 8;
120 break;
121 case 'x':
122 minDig = 1;
123 if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
124 ++offset;
125 braces = true;
126 maxDig = 8;
127 } else {
128 maxDig = 2;
129 }
130 break;
131 default:
132 dig = UCharacter.digit(c, 8);
133 if (dig >= 0) {
134 minDig = 1;
135 maxDig = 3;
136 n = 1; /* Already have first octal digit */
137 bitsPerDigit = 3;
138 result = dig;
139 }
140 break;
141 }
142 if (minDig != 0) {
143 while (offset < length && n < maxDig) {
144 c = UTF16.charAt(s, offset);
145 dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
146 if (dig < 0) {
147 break;
148 }
149 result = (result << bitsPerDigit) | dig;
150 offset += UTF16.getCharCount(c);
151 ++n;
152 }
153 if (n < minDig) {
154 return -1;
155 }
156 if (braces) {
157 if (c != 0x7D /*}*/) {
158 return -1;
159 }
160 ++offset;
161 }
162 if (result < 0 || result >= 0x110000) {
163 return -1;
164 }
165 // If an escape sequence specifies a lead surrogate, see
166 // if there is a trail surrogate after it, either as an
167 // escape or as a literal. If so, join them up into a
168 // supplementary.
169 if (offset < length &&
170 UTF16.isLeadSurrogate((char) result)) {
171 int ahead = offset+1;
172 c = s.charAt(offset); // [sic] get 16-bit code unit
173 if (c == '\\' && ahead < length) {
174 int o[] = new int[] { ahead };
175 c = unescapeAt(s, o);
176 ahead = o[0];
177 }
178 if (UTF16.isTrailSurrogate((char) c)) {
179 offset = ahead;
180 result = UCharacterProperty.getRawSupplementary(
181 (char) result, (char) c);
182 }
183 }
184 offset16[0] = offset;
185 return result;
186 }
187
188 /* Convert C-style escapes in table */
189 for (i=0; i<UNESCAPE_MAP.length; i+=2) {
190 if (c == UNESCAPE_MAP[i]) {
191 offset16[0] = offset;
192 return UNESCAPE_MAP[i+1];
193 } else if (c < UNESCAPE_MAP[i]) {
194 break;
195 }
196 }
197
198 /* Map \cX to control-X: X & 0x1F */
199 if (c == 'c' && offset < length) {
200 c = UTF16.charAt(s, offset);
201 offset16[0] = offset + UTF16.getCharCount(c);
202 return 0x1F & c;
203 }
204
205 /* If no special forms are recognized, then consider
206 * the backslash to generically escape the next character. */
207 offset16[0] = offset;
208 return c;
209 }
210
211 /**
212 * Convert a integer to size width hex uppercase digits.
213 * E.g., hex('a', 4, str) => "0041".
214 * Append the output to the given StringBuffer.
215 * If width is too small to fit, nothing will be appended to output.
216 */
217 public static StringBuffer hex(int ch, int width, StringBuffer output) {
218 return appendNumber(output, ch, 16, width);
219 }
220
221 /**
222 * Convert a integer to size width (minimum) hex uppercase digits.
223 * E.g., hex('a', 4, str) => "0041". If the integer requires more
224 * than width digits, more will be used.
225 */
226 public static String hex(int ch, int width) {
227 StringBuffer buf = new StringBuffer();
228 return appendNumber(buf, ch, 16, width).toString();
229 }
230
231 /**
232 * Skip over a sequence of zero or more white space characters
233 * at pos. Return the index of the first non-white-space character
234 * at or after pos, or str.length(), if there is none.
235 */
236 public static int skipWhitespace(String str, int pos) {
237 while (pos < str.length()) {
238 int c = UTF16.charAt(str, pos);
239 if (!UCharacterProperty.isRuleWhiteSpace(c)) {
240 break;
241 }
242 pos += UTF16.getCharCount(c);
243 }
244 return pos;
245 }
246
247 static final char DIGITS[] = {
248 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
249 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
250 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
251 'U', 'V', 'W', 'X', 'Y', 'Z'
252 };
253
254 /**
255 * Append the digits of a positive integer to the given
256 * <code>StringBuffer</code> in the given radix. This is
257 * done recursively since it is easiest to generate the low-
258 * order digit first, but it must be appended last.
259 *
260 * @param result is the <code>StringBuffer</code> to append to
261 * @param n is the positive integer
262 * @param radix is the radix, from 2 to 36 inclusive
263 * @param minDigits is the minimum number of digits to append.
264 */
265 private static void recursiveAppendNumber(StringBuffer result, int n,
266 int radix, int minDigits)
267 {
268 int digit = n % radix;
269
270 if (n >= radix || minDigits > 1) {
271 recursiveAppendNumber(result, n / radix, radix, minDigits - 1);
272 }
273
274 result.append(DIGITS[digit]);
275 }
276
277 /**
278 * Append a number to the given StringBuffer in the given radix.
279 * Standard digits '0'-'9' are used and letters 'A'-'Z' for
280 * radices 11 through 36.
281 * @param result the digits of the number are appended here
282 * @param n the number to be converted to digits; may be negative.
283 * If negative, a '-' is prepended to the digits.
284 * @param radix a radix from 2 to 36 inclusive.
285 * @param minDigits the minimum number of digits, not including
286 * any '-', to produce. Values less than 2 have no effect. One
287 * digit is always emitted regardless of this parameter.
288 * @return a reference to result
289 */
290 public static StringBuffer appendNumber(StringBuffer result, int n,
291 int radix, int minDigits)
292 throws IllegalArgumentException
293 {
294 if (radix < 2 || radix > 36) {
295 throw new IllegalArgumentException("Illegal radix " + radix);
296 }
297
298
299 int abs = n;
300
301 if (n < 0) {
302 abs = -n;
303 result.append("-");
304 }
305
306 recursiveAppendNumber(result, abs, radix, minDigits);
307
308 return result;
309 }
310
311 /**
312 * Return true if the character is NOT printable ASCII. The tab,
313 * newline and linefeed characters are considered unprintable.
314 */
315 public static boolean isUnprintable(int c) {
316 return !(c >= 0x20 && c <= 0x7E);
317 }
318
319 /**
320 * Escape unprintable characters using <backslash>uxxxx notation
321 * for U+0000 to U+FFFF and <backslash>Uxxxxxxxx for U+10000 and
322 * above. If the character is printable ASCII, then do nothing
323 * and return FALSE. Otherwise, append the escaped notation and
324 * return TRUE.
325 */
326 public static boolean escapeUnprintable(StringBuffer result, int c) {
327 if (isUnprintable(c)) {
328 result.append('\\');
329 if ((c & ~0xFFFF) != 0) {
330 result.append('U');
331 result.append(DIGITS[0xF&(c>>28)]);
332 result.append(DIGITS[0xF&(c>>24)]);
333 result.append(DIGITS[0xF&(c>>20)]);
334 result.append(DIGITS[0xF&(c>>16)]);
335 } else {
336 result.append('u');
337 }
338 result.append(DIGITS[0xF&(c>>12)]);
339 result.append(DIGITS[0xF&(c>>8)]);
340 result.append(DIGITS[0xF&(c>>4)]);
341 result.append(DIGITS[0xF&c]);
342 return true;
343 }
344 return false;
345 }
346
347 //// for StringPrep
348 /**
349 * Similar to StringBuffer.getChars, version 1.3.
350 * Since JDK 1.2 implements StringBuffer.getChars differently, this method
351 * is here to provide consistent results.
352 * To be removed after JDK 1.2 ceased to be the reference platform.
353 * @param src source string buffer
354 * @param srcBegin offset to the start of the src to retrieve from
355 * @param srcEnd offset to the end of the src to retrieve from
356 * @param dst char array to store the retrieved chars
357 * @param dstBegin offset to the start of the destination char array to
358 * store the retrieved chars
359 * @draft since ICU4J 2.0
360 */
361 public static void getChars(StringBuffer src, int srcBegin, int srcEnd,
362 char dst[], int dstBegin)
363 {
364 if (srcBegin == srcEnd) {
365 return;
366 }
367 src.getChars(srcBegin, srcEnd, dst, dstBegin);
368 }
369
370 /**
371 * Convenience utility to compare two char[]s.
372 * @param len the length to compare.
373 * The start indices and start+len must be valid.
374 */
375 public final static boolean arrayRegionMatches(char[] source, int sourceStart,
376 char[] target, int targetStart,
377 int len)
378 {
379 int sourceEnd = sourceStart + len;
380 int delta = targetStart - sourceStart;
381 for (int i = sourceStart; i < sourceEnd; i++) {
382 if (source[i] != target[i + delta])
383 return false;
384 }
385 return true;
386 }
387
388}
389///CLOVER:ON