blob: 3b872a6063fe68bee273f8390fbb701e9706684a [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26/*
27 *******************************************************************************
28 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
29 * *
30 * The original version of this source code and documentation is copyrighted *
31 * and owned by IBM, These materials are provided under terms of a License *
32 * Agreement between IBM and Sun. This technology is protected by multiple *
33 * US and International patents. This notice and attribution to IBM may not *
34 * to removed. *
35 *******************************************************************************
36 */
37
38package sun.text.normalizer;
39
/**
 * <p>Standalone utility class providing UTF16 character conversions and
 * indexing conversions.</p>
 * <p>Code that uses strings alone rarely needs modification.
 * By design, UTF-16 does not allow overlap, so searching for strings is a safe
 * operation. Similarly, concatenation is always safe. Substringing is safe if
 * the start and end are both on UTF-32 boundaries. In normal code, the values
 * for start and end are on those boundaries, since they arose from operations
 * like searching. If not, the nearest UTF-32 boundaries can be determined
 * using <code>bounds()</code>.</p>
 * <strong>Examples:</strong>
 * <p>The following examples illustrate use of some of these methods.
 * <pre>
 * // iteration forwards: Original
 * for (int i = 0; i &lt; s.length(); ++i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration forwards: Changes for UTF-32
 * int ch;
 * for (int i = 0; i &lt; s.length(); i += UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Original
 * for (int i = s.length() - 1; i &gt;= 0; --i) {
 *     char ch = s.charAt(i);
 *     doSomethingWith(ch);
 * }
 *
 * // iteration backwards: Changes for UTF-32
 * // (the condition must be "&gt;= 0", otherwise index 0 is never visited)
 * int ch;
 * for (int i = s.length() - 1; i &gt;= 0; i -= UTF16.getCharCount(ch)) {
 *     ch = UTF16.charAt(s, i);
 *     doSomethingWith(ch);
 * }
 * </pre>
 * <strong>Notes:</strong>
 * <ul>
 *   <li>
 *   <strong>Naming:</strong> For clarity, High and Low surrogates are called
 *   <code>Lead</code> and <code>Trail</code> in the API, which gives a better
 *   sense of their ordering in a string. <code>offset16</code> and
 *   <code>offset32</code> are used to distinguish offsets to UTF-16
 *   boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
 *   used to contain UTF-32 characters, as opposed to <code>char16</code>,
 *   which is a UTF-16 code unit.
 *   </li>
 *   <li>
 *   <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
 *   UTF-32 offset to a UTF-16 offset and back. Because of the difference in
 *   structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
 *   back if and only if <code>bounds(string, offset16) != TRAIL</code>.
 *   </li>
 *   <li>
 *   <strong>Exceptions:</strong> The error checking will throw an exception
 *   if indices are out of bounds. Other than that, all methods will
 *   behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
 *   values are present. <code>UCharacter.isLegal()</code> can be used to check
 *   for validity if desired.
 *   </li>
 *   <li>
 *   <strong>Unmatched Surrogates:</strong> If the string contains unmatched
 *   surrogates, then these are counted as one UTF-32 value. This matches
 *   their iteration behavior, which is vital. It also matches common display
 *   practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
 *   </li>
 *   <li>
 *   <strong>Optimization:</strong> The method implementations may need
 *   optimization if the compiler doesn't fold static final methods. Since
 *   surrogate pairs will form an exceedingly small percentage of all the text
 *   in the world, the singleton case should always be optimized for.
 *   </li>
 * </ul>
 * @author Mark Davis, with help from Markus Scherer
 * @stable ICU 2.1
 */

public final class UTF16
{
    // public variables ---------------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MIN_VALUE = 0;
    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * @stable ICU 2.1
     */
    public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
    /**
     * The minimum value for Supplementary code points
     * @stable ICU 2.1
     */
    public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
    /**
     * Lead surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
    /**
     * Trail surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
    /**
     * Lead surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
    /**
     * Trail surrogate maximum value
     * @stable ICU 2.1
     */
    public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
    /**
     * Surrogate minimum value
     * @stable ICU 2.1
     */
    public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;

    // public method ------------------------------------------------------

    /**
     * Extract a single UTF-32 value from a string.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on
     * the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character (the unmatched surrogate itself)
     * will be returned.
     * @param source the string to read from
     * @param offset16 UTF-16 offset of any code unit of the character; it
     *        may point at either half of a surrogate pair.
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16.
     * @exception IndexOutOfBoundsException thrown if offset16 is out of
     *            bounds.
     * @stable ICU 2.1
     */
    public static int charAt(String source, int offset16)
    {
        if (offset16 < 0 || offset16 >= source.length()) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        char single = source.charAt(offset16);
        if (!isSurrogate(single)) {
            // The overwhelmingly common case: a plain BMP code unit.
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < source.length()) {
                char trail = source.charAt(next);
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (offset16 > 0) {
            // single is a trail surrogate, so look for its lead just before
            char lead = source.charAt(offset16 - 1);
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Extract a single UTF-32 value from a substring.
     * Used when iterating forwards or backwards (with
     * <code>UTF16.getCharCount()</code>), as well as random access. If a
     * validity check is required, use <code>UCharacter.isLegal()</code> on
     * the return value.
     * If the char retrieved is part of a surrogate pair, its supplementary
     * character will be returned. If a complete supplementary character is
     * not found the incomplete character will be returned. Code units
     * outside <code>[start, limit)</code> are never examined, so a pair that
     * straddles either edge of the subarray is reported as an unmatched
     * surrogate.
     * @param source array of UTF-16 chars
     * @param start offset of the first code unit of the subarray (inclusive)
     * @param limit offset just past the last code unit of the subarray
     *        (exclusive)
     * @param offset16 UTF-16 offset relative to start
     * @return UTF-32 value for the UTF-32 value that contains the char at
     *         offset16.
     * @exception IndexOutOfBoundsException thrown if offset16 is not within
     *            the range of start and limit.
     * @stable ICU 2.1
     */
    public static int charAt(char[] source, int start, int limit,
                             int offset16)
    {
        // From here on offset16 is an absolute index into source.
        offset16 += start;
        if (offset16 < start || offset16 >= limit) {
            throw new ArrayIndexOutOfBoundsException(offset16);
        }

        char single = source[offset16];
        if (!isSurrogate(single)) {
            return single;
        }

        // Convert the UTF-16 surrogate pair if necessary.
        // For simplicity in usage, and because the frequency of pairs is
        // low, look both directions.
        if (isLeadSurrogate(single)) {
            int next = offset16 + 1;
            if (next < limit) {
                char trail = source[next];
                if (isTrailSurrogate(trail)) {
                    return Character.toCodePoint(single, trail);
                }
            }
        } else if (offset16 > start) {
            // single is a trail surrogate, so look for its lead just before
            char lead = source[offset16 - 1];
            if (isLeadSurrogate(lead)) {
                return Character.toCodePoint(lead, single);
            }
        }
        return single; // return unmatched surrogate
    }

    /**
     * Determines how many chars this char32 requires.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling; no range check is performed here.
     * @param char32 the input codepoint.
     * @return 2 if is in supplementary space, otherwise 1.
     * @stable ICU 2.1
     */
    public static int getCharCount(int char32)
    {
        return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
    }

    /**
     * Determines whether the code value is a surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a surrogate.
     * @stable ICU 2.1
     */
    public static boolean isSurrogate(char char16)
    {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a trail surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a trail surrogate.
     * @stable ICU 2.1
     */
    public static boolean isTrailSurrogate(char char16)
    {
        return TRAIL_SURROGATE_MIN_VALUE <= char16
            && char16 <= TRAIL_SURROGATE_MAX_VALUE;
    }

    /**
     * Determines whether the character is a lead surrogate.
     * @param char16 the input character.
     * @return true iff the input character is a lead surrogate
     * @stable ICU 2.1
     */
    public static boolean isLeadSurrogate(char char16)
    {
        return LEAD_SURROGATE_MIN_VALUE <= char16
            && char16 <= LEAD_SURROGATE_MAX_VALUE;
    }

    /**
     * Returns the lead surrogate.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling; values above
     * {@link #CODEPOINT_MAX_VALUE} are not rejected here.
     * @param char32 the input character.
     * @return lead surrogate if the getCharCount(ch) is 2; <br>
     *         and 0 otherwise (note: 0 is not a valid lead surrogate).
     * @stable ICU 2.1
     */
    public static char getLeadSurrogate(int char32)
    {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (LEAD_SURROGATE_OFFSET_
                           + (char32 >> LEAD_SURROGATE_SHIFT_));
        }
        return 0;
    }

    /**
     * Returns the trail surrogate.
     * If a validity check is required, use <code>UCharacter.isLegal()</code>
     * on char32 before calling.
     * @param char32 the input character.
     * @return the trail surrogate if the getCharCount(ch) is 2; <br>otherwise
     *         the character itself (narrowed to <code>char</code>)
     * @stable ICU 2.1
     */
    public static char getTrailSurrogate(int char32)
    {
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            return (char) (TRAIL_SURROGATE_MIN_VALUE
                           + (char32 & TRAIL_SURROGATE_MASK_));
        }
        return (char) char32;
    }

    /**
     * Convenience method corresponding to String.valueOf(char). Returns a one
     * or two char string containing the UTF-32 value in UTF16 format.
     * @param char32 the input character.
     * @return string value of char32 in UTF16 format
     * @exception IllegalArgumentException thrown if char32 is outside the
     *            range CODEPOINT_MIN_VALUE..CODEPOINT_MAX_VALUE.
     * @stable ICU 2.1
     */
    public static String valueOf(int char32)
    {
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint");
        }
        return toString(char32);
    }

    /**
     * Append a single UTF-32 value to the end of a StringBuffer.
     * @param target the buffer to append to
     * @param char32 value to append.
     * @return the updated StringBuffer (the same instance as target)
     * @exception IllegalArgumentException thrown when char32 does not lie
     *            within the range of the Unicode codepoints
     * @stable ICU 2.1
     */
    public static StringBuffer append(StringBuffer target, int char32)
    {
        // Check for irregular values
        if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
            throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
        }

        // Write the UTF-16 values
        if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
            target.append(getLeadSurrogate(char32));
            target.append(getTrailSurrogate(char32));
        } else {
            target.append((char) char32);
        }
        return target;
    }

    //// for StringPrep
    /**
     * Shifts offset16 by the argument number of codepoints within a subarray.
     * A well-formed surrogate pair lying entirely inside the subarray counts
     * as one codepoint; any other code unit (including an unmatched
     * surrogate) counts as one codepoint on its own.
     * @param source char array
     * @param start position of the subarray to be performed on (inclusive)
     * @param limit position of the subarray to be performed on (exclusive)
     * @param offset16 UTF16 position to shift relative to start
     * @param shift32 number of codepoints to shift; positive moves forwards,
     *        negative moves backwards, zero returns offset16 unchanged
     * @return new shifted offset16 relative to start
     * @exception IndexOutOfBoundsException if the new offset16 is out of
     *            bounds with respect to the subarray or the subarray bounds
     *            are out of range.
     * @stable ICU 2.1
     */
    public static int moveCodePointOffset(char[] source, int start, int limit,
                                          int offset16, int shift32)
    {
        int size = source.length;
        if (start < 0 || limit < start) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (limit > size) {
            throw new StringIndexOutOfBoundsException(limit);
        }
        // Compare against (limit - start) rather than computing
        // offset16 + start first: the sum can overflow to a negative value
        // and slip past the bounds check. limit >= start >= 0 holds here, so
        // the subtraction cannot overflow.
        if (offset16 < 0 || offset16 > limit - start) {
            throw new StringIndexOutOfBoundsException(offset16);
        }

        // Absolute index into source; start <= result <= limit <= size.
        int result = offset16 + start;
        int count;
        if (shift32 > 0) {
            // Quick rejection: every codepoint needs at least one code unit.
            // Written as a subtraction so that a huge shift32 cannot
            // overflow the comparison.
            if (shift32 > size - result) {
                throw new StringIndexOutOfBoundsException(result);
            }
            count = shift32;
            while (result < limit && count > 0) {
                char ch = source[result];
                if (isLeadSurrogate(ch) && (result + 1 < limit)
                        && isTrailSurrogate(source[result + 1])) {
                    // skip the trail half of a complete pair
                    result++;
                }
                count--;
                result++;
            }
        } else {
            // shift32 <= 0 and result >= 0, so this sum cannot overflow.
            if (result + shift32 < start) {
                throw new StringIndexOutOfBoundsException(result);
            }
            for (count = -shift32; count > 0; count--) {
                result--;
                if (result < start) {
                    // ran off the front of the subarray; reported below
                    break;
                }
                char ch = source[result];
                if (isTrailSurrogate(ch) && result > start
                        && isLeadSurrogate(source[result - 1])) {
                    // step over the lead half of a complete pair
                    result--;
                }
            }
        }
        // A non-zero remainder means the subarray held too few codepoints.
        if (count != 0) {
            throw new StringIndexOutOfBoundsException(shift32);
        }
        return result - start;
    }

    // private data members -------------------------------------------------

    /**
     * Shift value for lead surrogate to form a supplementary character.
     */
    private static final int LEAD_SURROGATE_SHIFT_ = 10;

    /**
     * Mask to retrieve the significant value from a trail surrogate.
     */
    private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;

    /**
     * Value that all lead surrogate starts with
     */
    private static final int LEAD_SURROGATE_OFFSET_ =
        LEAD_SURROGATE_MIN_VALUE
        - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);

    // private methods ------------------------------------------------------

    /**
     * <p>Converts argument code point and returns a String object representing
     * the code point's value in UTF16 format.</p>
     * <p>This method does not check for the validity of the codepoint, the
     * results are not guaranteed if a invalid codepoint is passed as
     * argument.</p>
     * <p>The result is a string whose length is 1 for non-supplementary code
     * points, 2 otherwise.</p>
     * @param ch code point
     * @return string representation of the code point
     */
    private static String toString(int ch)
    {
        if (ch < SUPPLEMENTARY_MIN_VALUE) {
            return String.valueOf((char) ch);
        }
        // Exactly two code units are needed; build the string directly
        // instead of going through a synchronized, growable buffer.
        return new String(new char[] {
            getLeadSurrogate(ch), getTrailSurrogate(ch)
        });
    }
}