blob: 26a5eca99fce1191cd34863e666f8065ae7cfd74 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25/*
26 *******************************************************************************
27 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
28 * *
29 * The original version of this source code and documentation is copyrighted *
30 * and owned by IBM, These materials are provided under terms of a License *
31 * Agreement between IBM and Sun. This technology is protected by multiple *
32 * US and International patents. This notice and attribution to IBM may not *
33 * to removed. *
34 *******************************************************************************
35 */
36
37package sun.text.normalizer;
38
39import java.lang.ref.SoftReference;
40import java.util.HashMap;
41import java.util.Locale;
42import java.util.Map;
43
44/**
45 * <p>
46 * The UCharacter class provides extensions to the
47 * <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html>
48 * java.lang.Character</a> class. These extensions provide support for
49 * Unicode 3.2 properties and together with the <a href=../text/UTF16.html>UTF16</a>
50 * class, provide support for supplementary characters (those with code
51 * points above U+FFFF).
52 * </p>
53 * <p>
54 * Code points are represented in these API using ints. While it would be
55 * more convenient in Java to have a separate primitive datatype for them,
56 * ints suffice in the meantime.
57 * </p>
58 * <p>
59 * To use this class please add the jar file name icu4j.jar to the
60 * class path, since it contains data files which supply the information used
61 * by this file.<br>
62 * E.g. In Windows <br>
63 * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
64 * Otherwise, another method would be to copy the files uprops.dat and
65 * unames.icu from the icu4j source subdirectory
66 * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
67 * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
68 * </p>
69 * <p>
70 * Aside from the additions for UTF-16 support, and the updated Unicode 3.1
71 * properties, the main differences between UCharacter and Character are:
72 * <ul>
73 * <li> UCharacter is not designed to be a char wrapper and does not have
74 * APIs that involve management of that single char.<br>
75 * These include:
76 * <ul>
77 * <li> char charValue(),
78 * <li> int compareTo(java.lang.Character, java.lang.Character), etc.
79 * </ul>
80 * <li> UCharacter does not include Character APIs that are deprecated, nor
81 * does it include the Java-specific character information, such as
82 * boolean isJavaIdentifierPart(char ch).
83 * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
84 * values '10' - '35'. UCharacter also does this in digit and
85 * getNumericValue, to adhere to the java semantics of these
86 * methods. New methods unicodeDigit, and
87 * getUnicodeNumericValue do not treat the above code points
88 * as having numeric values. This is a semantic change from ICU4J 1.3.1.
89 * </ul>
90 * <p>
91 * Further detail differences can be determined from the program
92 * <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java>
93 * com.ibm.icu.dev.test.lang.UCharacterCompare</a>
94 * </p>
95 * <p>
96 * This class is not subclassable
97 * </p>
98 * @author Syn Wee Quek
99 * @stable ICU 2.1
100 * @see com.ibm.icu.lang.UCharacterEnums
101 */
102
103public final class UCharacter
104{
105
/**
 * Numeric Type constants.
 * <p>{@link #DECIMAL} is the type tested by <code>digit(int, int)</code>;
 * {@link #NONE} and {@link #COUNT} are used as exclusive bounds by
 * <code>getUnicodeNumericValue(int)</code>.</p>
 * @see UProperty#NUMERIC_TYPE
 * @stable ICU 2.4
 */
public interface NumericType
{
    // Interface fields are implicitly public, static and final.

    /** @stable ICU 2.4 */
    int NONE = 0;

    /** @stable ICU 2.4 */
    int DECIMAL = 1;

    /** @stable ICU 2.4 */
    int DIGIT = 2;

    /** @stable ICU 2.4 */
    int NUMERIC = 3;

    /**
     * One more than the highest numeric type value above.
     * @stable ICU 2.4
     */
    int COUNT = 4;
}
134
/**
 * Hangul Syllable Type constants. The bracketed abbreviation in each
 * comment is the short label carried over from the original source.
 *
 * @see UProperty#HANGUL_SYLLABLE_TYPE
 * @stable ICU 2.6
 */
public interface HangulSyllableType
{
    // Interface fields are implicitly public, static and final.

    /** [NA] @stable ICU 2.6 */
    int NOT_APPLICABLE = 0;

    /** [L] @stable ICU 2.6 */
    int LEADING_JAMO = 1;

    /** [V] @stable ICU 2.6 */
    int VOWEL_JAMO = 2;

    /** [T] @stable ICU 2.6 */
    int TRAILING_JAMO = 3;

    /** [LV] @stable ICU 2.6 */
    int LV_SYLLABLE = 4;

    /** [LVT] @stable ICU 2.6 */
    int LVT_SYLLABLE = 5;

    /**
     * One more than the highest syllable type value above.
     * @stable ICU 2.6
     */
    int COUNT = 6;
}
172
/**
 * [Sun] This interface moved from UCharacterEnums.java.
 *
 * 'Enum' for the CharacterCategory constants. These constants are
 * compatible in name <b>but not in value</b> with those defined in
 * <code>java.lang.Character</code>. Only the three categories used by
 * this class are declared here.
 * @see UCharacterCategory
 * @draft ICU 3.0
 * @deprecated This is a draft API and might change in a future release of ICU.
 */
public interface ECharacterCategory
{
    // Interface fields are implicitly public, static and final.

    /**
     * Character type Lu
     * @stable ICU 2.1
     */
    int UPPERCASE_LETTER = 1;

    /**
     * Character type Lt
     * @stable ICU 2.1
     */
    int TITLECASE_LETTER = 3;

    /**
     * Character type Lo
     * @stable ICU 2.1
     */
    int OTHER_LETTER = 5;
}
203
204 // public data members -----------------------------------------------
205
206 /**
207 * The lowest Unicode code point value.
208 * @stable ICU 2.1
209 */
210 public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
211
212 /**
213 * The highest Unicode code point value (scalar value) according to the
214 * Unicode Standard.
215 * This is a 21-bit value (21 bits, rounded up).<br>
216 * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
217 * @stable ICU 2.1
218 */
219 public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
220
221 /**
222 * The minimum value for Supplementary code points
223 * @stable ICU 2.1
224 */
225 public static final int SUPPLEMENTARY_MIN_VALUE =
226 UTF16.SUPPLEMENTARY_MIN_VALUE;
227
228 /**
229 * Special value that is returned by getUnicodeNumericValue(int) when no
230 * numeric value is defined for a code point.
231 * @stable ICU 2.4
232 * @see #getUnicodeNumericValue
233 */
234 public static final double NO_NUMERIC_VALUE = -123456789;
235
236 // public methods ----------------------------------------------------
237
238 /**
239 * Retrieves the numeric value of a decimal digit code point.
240 * <br>This method observes the semantics of
241 * <code>java.lang.Character.digit()</code>. Note that this
242 * will return positive values for code points for which isDigit
243 * returns false, just like java.lang.Character.
244 * <br><em>Semantic Change:</em> In release 1.3.1 and
245 * prior, this did not treat the European letters as having a
246 * digit value, and also treated numeric letters and other numbers as
247 * digits.
248 * This has been changed to conform to the java semantics.
249 * <br>A code point is a valid digit if and only if:
250 * <ul>
251 * <li>ch is a decimal digit or one of the european letters, and
252 * <li>the value of ch is less than the specified radix.
253 * </ul>
254 * @param ch the code point to query
255 * @param radix the radix
256 * @return the numeric value represented by the code point in the
257 * specified radix, or -1 if the code point is not a decimal digit
258 * or if its value is too large for the radix
259 * @stable ICU 2.1
260 */
261 public static int digit(int ch, int radix)
262 {
263 // when ch is out of bounds getProperty == 0
264 int props = getProperty(ch);
265 if (getNumericType(props) != NumericType.DECIMAL) {
266 return (radix <= 10) ? -1 : getEuropeanDigit(ch);
267 }
268 // if props == 0, it will just fall through and return -1
269 if (isNotExceptionIndicator(props)) {
270 // not contained in exception data
271 // getSignedValue is just shifting so we can check for the sign
272 // first
273 // Optimization
274 // int result = UCharacterProperty.getSignedValue(props);
275 // if (result >= 0) {
276 // return result;
277 // }
278 if (props >= 0) {
279 return UCharacterProperty.getSignedValue(props);
280 }
281 }
282 else {
283 int index = UCharacterProperty.getExceptionIndex(props);
284 if (PROPERTY_.hasExceptionValue(index,
285 UCharacterProperty.EXC_NUMERIC_VALUE_)) {
286 int result = PROPERTY_.getException(index,
287 UCharacterProperty.EXC_NUMERIC_VALUE_);
288 if (result >= 0) {
289 return result;
290 }
291 }
292 }
293
294 if (radix > 10) {
295 int result = getEuropeanDigit(ch);
296 if (result >= 0 && result < radix) {
297 return result;
298 }
299 }
300 return -1;
301 }
302
303 /**
304 * <p>Get the numeric value for a Unicode code point as defined in the
305 * Unicode Character Database.</p>
306 * <p>A "double" return type is necessary because some numeric values are
307 * fractions, negative, or too large for int.</p>
308 * <p>For characters without any numeric values in the Unicode Character
309 * Database, this function will return NO_NUMERIC_VALUE.</p>
310 * <p><em>API Change:</em> In release 2.2 and prior, this API has a
311 * return type int and returns -1 when the argument ch does not have a
312 * corresponding numeric value. This has been changed to synch with ICU4C
313 * </p>
314 * This corresponds to the ICU4C function u_getNumericValue.
315 * @param ch Code point to get the numeric value for.
316 * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
317 * @stable ICU 2.4
318 */
319 public static double getUnicodeNumericValue(int ch)
320 {
321 // equivalent to c version double u_getNumericValue(UChar32 c)
322 int props = PROPERTY_.getProperty(ch);
323 int numericType = getNumericType(props);
324 if (numericType > NumericType.NONE && numericType < NumericType.COUNT) {
325 if (isNotExceptionIndicator(props)) {
326 return UCharacterProperty.getSignedValue(props);
327 }
328 else {
329 int index = UCharacterProperty.getExceptionIndex(props);
330 boolean nex = false;
331 boolean dex = false;
332 double numerator = 0;
333 if (PROPERTY_.hasExceptionValue(index,
334 UCharacterProperty.EXC_NUMERIC_VALUE_)) {
335 int num = PROPERTY_.getException(index,
336 UCharacterProperty.EXC_NUMERIC_VALUE_);
337 // There are special values for huge numbers that are
338 // powers of ten. genprops/store.c documents:
339 // if numericValue = 0x7fffff00 + x then
340 // numericValue = 10 ^ x
341 if (num >= NUMERATOR_POWER_LIMIT_) {
342 num &= 0xff;
343 // 10^x without math.h
344 numerator = Math.pow(10, num);
345 }
346 else {
347 numerator = num;
348 }
349 nex = true;
350 }
351 double denominator = 0;
352 if (PROPERTY_.hasExceptionValue(index,
353 UCharacterProperty.EXC_DENOMINATOR_VALUE_)) {
354 denominator = PROPERTY_.getException(index,
355 UCharacterProperty.EXC_DENOMINATOR_VALUE_);
356 // faster path not in c
357 if (numerator != 0) {
358 return numerator / denominator;
359 }
360 dex = true;
361 }
362
363 if (nex) {
364 if (dex) {
365 return numerator / denominator;
366 }
367 return numerator;
368 }
369 if (dex) {
370 return 1 / denominator;
371 }
372 }
373 }
374 return NO_NUMERIC_VALUE;
375 }
376
377 /**
378 * Returns a value indicating a code point's Unicode category.
379 * Up-to-date Unicode implementation of java.lang.Character.getType()
380 * except for the above mentioned code points that had their category
381 * changed.<br>
382 * Return results are constants from the interface
383 * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
384 * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
385 * those returned by java.lang.Character.getType. UCharacterCategory values
386 * match the ones used in ICU4C, while java.lang.Character type
387 * values, though similar, skip the value 17.</p>
388 * @param ch code point whose type is to be determined
389 * @return category which is a value of UCharacterCategory
390 * @stable ICU 2.1
391 */
392 public static int getType(int ch)
393 {
394 return getProperty(ch) & UCharacterProperty.TYPE_MASK;
395 }
396
397 //// for StringPrep
398 /**
399 * Returns a code point corresponding to the two UTF16 characters.
400 * @param lead the lead char
401 * @param trail the trail char
402 * @return code point if surrogate characters are valid.
403 * @exception IllegalArgumentException thrown when argument characters do
404 * not form a valid codepoint
405 * @stable ICU 2.1
406 */
407 public static int getCodePoint(char lead, char trail)
408 {
409 if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
410 lead <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
411 trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
412 trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
413 return UCharacterProperty.getRawSupplementary(lead, trail);
414 }
415 throw new IllegalArgumentException("Illegal surrogate characters");
416 }
417
418 //// for StringPrep
419 /**
420 * Returns the Bidirection property of a code point.
421 * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
422 * property.<br>
423 * Result returned belongs to the interface
424 * <a href=UCharacterDirection.html>UCharacterDirection</a>
425 * @param ch the code point to be determined its direction
426 * @return direction constant from UCharacterDirection.
427 * @stable ICU 2.1
428 */
429 public static int getDirection(int ch)
430 {
431 // when ch is out of bounds getProperty == 0
432 return (getProperty(ch) >> BIDI_SHIFT_) & BIDI_MASK_AFTER_SHIFT_;
433 }
434
435 /**
436 * The given string is mapped to its case folding equivalent according to
437 * UnicodeData.txt and CaseFolding.txt; if any character has no case
438 * folding equivalent, the character itself is returned.
439 * "Full", multiple-code point case folding mappings are returned here.
440 * For "simple" single-code point mappings use the API
441 * foldCase(int ch, boolean defaultmapping).
442 * @param str the String to be converted
443 * @param defaultmapping Indicates if all mappings defined in
444 * CaseFolding.txt is to be used, otherwise the
445 * mappings for dotted I and dotless i marked with
446 * 'I' in CaseFolding.txt will be skipped.
447 * @return the case folding equivalent of the character, if
448 * any; otherwise the character itself.
449 * @see #foldCase(int, boolean)
450 * @stable ICU 2.1
451 */
452 public static String foldCase(String str, boolean defaultmapping)
453 {
454 int size = str.length();
455 StringBuffer result = new StringBuffer(size);
456 int offset = 0;
457 int ch;
458
459 // case mapping loop
460 while (offset < size) {
461 ch = UTF16.charAt(str, offset);
462 offset += UTF16.getCharCount(ch);
463 int props = PROPERTY_.getProperty(ch);
464 if (isNotExceptionIndicator(props)) {
465 int type = UCharacterProperty.TYPE_MASK & props;
466 if (type == ECharacterCategory.UPPERCASE_LETTER ||
467 type == ECharacterCategory.TITLECASE_LETTER) {
468 ch += UCharacterProperty.getSignedValue(props);
469 }
470 }
471 else {
472 int index = UCharacterProperty.getExceptionIndex(props);
473 if (PROPERTY_.hasExceptionValue(index,
474 UCharacterProperty.EXC_CASE_FOLDING_)) {
475 int exception = PROPERTY_.getException(index,
476 UCharacterProperty.EXC_CASE_FOLDING_);
477 if (exception != 0) {
478 PROPERTY_.getFoldCase(exception & LAST_CHAR_MASK_,
479 exception >> SHIFT_24_, result);
480 }
481 else {
482 // special case folding mappings, hardcoded
483 if (ch != 0x49 && ch != 0x130) {
484 // return ch itself because there is no special
485 // mapping for it
486 UTF16.append(result, ch);
487 continue;
488 }
489 if (defaultmapping) {
490 // default mappings
491 if (ch == 0x49) {
492 // 0049; C; 0069; # LATIN CAPITAL LETTER I
493 result.append(
494 UCharacterProperty.LATIN_SMALL_LETTER_I_);
495 }
496 else if (ch == 0x130) {
497 // 0130; F; 0069 0307;
498 // # LATIN CAPITAL LETTER I WITH DOT ABOVE
499 result.append(
500 UCharacterProperty.LATIN_SMALL_LETTER_I_);
501 result.append((char)0x307);
502 }
503 }
504 else {
505 // Turkic mappings
506 if (ch == 0x49) {
507 // 0049; T; 0131; # LATIN CAPITAL LETTER I
508 result.append((char)0x131);
509 }
510 else if (ch == 0x130) {
511 // 0130; T; 0069;
512 // # LATIN CAPITAL LETTER I WITH DOT ABOVE
513 result.append(
514 UCharacterProperty.LATIN_SMALL_LETTER_I_);
515 }
516 }
517 }
518 // do not fall through to the output of c
519 continue;
520 }
521 else {
522 if (PROPERTY_.hasExceptionValue(index,
523 UCharacterProperty.EXC_LOWERCASE_)) {
524 ch = PROPERTY_.getException(index,
525 UCharacterProperty.EXC_LOWERCASE_);
526 }
527 }
528
529 }
530
531 // handle 1:1 code point mappings from UnicodeData.txt
532 UTF16.append(result, ch);
533 }
534
535 return result.toString();
536 }
537
538 /**
539 * <p>Get the "age" of the code point.</p>
540 * <p>The "age" is the Unicode version when the code point was first
541 * designated (as a non-character or for Private Use) or assigned a
542 * character.
543 * <p>This can be useful to avoid emitting code points to receiving
544 * processes that do not accept newer characters.</p>
545 * <p>The data is from the UCD file DerivedAge.txt.</p>
546 * @param ch The code point.
547 * @return the Unicode version number
548 * @stable ICU 2.6
549 */
550 public static VersionInfo getAge(int ch)
551 {
552 if (ch < MIN_VALUE || ch > MAX_VALUE) {
553 throw new IllegalArgumentException("Codepoint out of bounds");
554 }
555 return PROPERTY_.getAge(ch);
556 }
557
558 /**
559 * <p>Gets the property value for an Unicode property type of a code point.
560 * Also returns binary and mask property values.</p>
561 * <p>Unicode, especially in version 3.2, defines many more properties than
562 * the original set in UnicodeData.txt.</p>
563 * <p>The properties APIs are intended to reflect Unicode properties as
564 * defined in the Unicode Character Database (UCD) and Unicode Technical
565 * Reports (UTR). For details about the properties see
566 * http://www.unicode.org/.</p>
567 * <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
568 * </p>
569 * <pre>
570 * Sample usage:
571 * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
572 * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
573 * boolean b = (ideo == 1) ? true : false;
574 * </pre>
575 * @param ch code point to test.
576 * @param type UProperty selector constant, identifies which binary
577 * property to check. Must be
578 * UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
579 * UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
580 * UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
581 * @return numeric value that is directly the property value or,
582 * for enumerated properties, corresponds to the numeric value of
583 * the enumerated constant of the respective property value
584 * enumeration type (cast to enum type if necessary).
585 * Returns 0 or 1 (for false / true) for binary Unicode properties.
586 * Returns a bit-mask for mask properties.
587 * Returns 0 if 'type' is out of bounds or if the Unicode version
588 * does not have data for the property at all, or not for this code
589 * point.
590 * @see UProperty
591 * @see #hasBinaryProperty
592 * @see #getIntPropertyMinValue
593 * @see #getIntPropertyMaxValue
594 * @see #getUnicodeVersion
595 * @stable ICU 2.4
596 */
597 public static int getIntPropertyValue(int ch, int type)
598 {
599 /*
600 * For Normalizer with Unicode 3.2, this method is called only for
601 * HANGUL_SYLLABLE_TYPE in UnicodeSet.addPropertyStarts().
602 */
603 if (type == UProperty.HANGUL_SYLLABLE_TYPE) {
604 /* purely algorithmic; hardcode known characters, check for assigned new ones */
605 if(ch<NormalizerImpl.JAMO_L_BASE) {
606 /* NA */
607 } else if(ch<=0x11ff) {
608 /* Jamo range */
609 if(ch<=0x115f) {
610 /* Jamo L range, HANGUL CHOSEONG ... */
611 if(ch==0x115f || ch<=0x1159 || getType(ch)==ECharacterCategory.OTHER_LETTER) {
612 return HangulSyllableType.LEADING_JAMO;
613 }
614 } else if(ch<=0x11a7) {
615 /* Jamo V range, HANGUL JUNGSEONG ... */
616 if(ch<=0x11a2 || getType(ch)==ECharacterCategory.OTHER_LETTER) {
617 return HangulSyllableType.VOWEL_JAMO;
618 }
619 } else {
620 /* Jamo T range */
621 if(ch<=0x11f9 || getType(ch)==ECharacterCategory.OTHER_LETTER) {
622 return HangulSyllableType.TRAILING_JAMO;
623 }
624 }
625 } else if((ch-=NormalizerImpl.HANGUL_BASE)<0) {
626 /* NA */
627 } else if(ch<NormalizerImpl.HANGUL_COUNT) {
628 /* Hangul syllable */
629 return ch%NormalizerImpl.JAMO_T_COUNT==0 ? HangulSyllableType.LV_SYLLABLE : HangulSyllableType.LVT_SYLLABLE;
630 }
631 }
632 return 0; /* NA */
633 }
634
635 // private variables -------------------------------------------------
636
/**
 * Database storing the sets of character property
 */
private static final UCharacterProperty PROPERTY_;
/**
 * Direct references into the property database, cached for optimization
 * (used by getProperty).
 */
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int[] PROPERTY_DATA_;
/**
 * Property value found at the trie's initial-value offset; returned by
 * getProperty when a lookup cannot be resolved.
 */
private static final int PROPERTY_INITIAL_VALUE_;

// block to initialise character property database
static
{
    try
    {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_DATA_ = PROPERTY_.m_property_;
        PROPERTY_INITIAL_VALUE_
                         = PROPERTY_DATA_[PROPERTY_.m_trieInitialValue_];
    }
    catch (Exception e)
    {
        // Chain the original exception so its type and stack trace are
        // not lost when class initialization fails.
        throw new RuntimeException(e.getMessage(), e);
    }
}
666
667 /**
668 * To get the last character out from a data type
669 */
670 private static final int LAST_CHAR_MASK_ = 0xFFFF;
671
672 /**
673 * To get the last byte out from a data type
674 */
675// private static final int LAST_BYTE_MASK_ = 0xFF;
676
677 /**
678 * Shift 16 bits
679 */
680// private static final int SHIFT_16_ = 16;
681
682 /**
683 * Shift 24 bits
684 */
685 private static final int SHIFT_24_ = 24;
686
687 /**
688 * Shift to get numeric type
689 */
690 private static final int NUMERIC_TYPE_SHIFT_ = 12;
691 /**
692 * Mask to get numeric type
693 */
694 private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
695 /**
696 * Shift to get bidi bits
697 */
698 private static final int BIDI_SHIFT_ = 6;
699
700 /**
701 * Mask to be applied after shifting to get bidi bits
702 */
703 private static final int BIDI_MASK_AFTER_SHIFT_ = 0x1F;
704
705 /**
706 * <p>Numerator power limit.
707 * There are special values for huge numbers that are powers of ten.</p>
708 * <p>c version genprops/store.c documents:
709 * if numericValue = 0x7fffff00 + x then numericValue = 10 ^ x</p>
710 */
711 private static final int NUMERATOR_POWER_LIMIT_ = 0x7fffff00;
712 /**
713 * Integer properties mask and shift values for joining type.
714 * Equivalent to icu4c UPROPS_JT_MASK.
715 */
716 private static final int JOINING_TYPE_MASK_ = 0x00003800;
717 /**
718 * Integer properties mask and shift values for joining type.
719 * Equivalent to icu4c UPROPS_JT_SHIFT.
720 */
721 private static final int JOINING_TYPE_SHIFT_ = 11;
722 /**
723 * Integer properties mask and shift values for joining group.
724 * Equivalent to icu4c UPROPS_JG_MASK.
725 */
726 private static final int JOINING_GROUP_MASK_ = 0x000007e0;
727 /**
728 * Integer properties mask and shift values for joining group.
729 * Equivalent to icu4c UPROPS_JG_SHIFT.
730 */
731 private static final int JOINING_GROUP_SHIFT_ = 5;
732 /**
733 * Integer properties mask for decomposition type.
734 * Equivalent to icu4c UPROPS_DT_MASK.
735 */
736 private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
737 /**
738 * Integer properties mask and shift values for East Asian cell width.
739 * Equivalent to icu4c UPROPS_EA_MASK
740 */
741 private static final int EAST_ASIAN_MASK_ = 0x00038000;
742 /**
743 * Integer properties mask and shift values for East Asian cell width.
744 * Equivalent to icu4c UPROPS_EA_SHIFT
745 */
746 private static final int EAST_ASIAN_SHIFT_ = 15;
747
748 /**
749 * Integer properties mask and shift values for line breaks.
750 * Equivalent to icu4c UPROPS_LB_MASK
751 */
752 private static final int LINE_BREAK_MASK_ = 0x007C0000;
753 /**
754 * Integer properties mask and shift values for line breaks.
755 * Equivalent to icu4c UPROPS_LB_SHIFT
756 */
757 private static final int LINE_BREAK_SHIFT_ = 18;
758 /**
759 * Integer properties mask and shift values for blocks.
760 * Equivalent to icu4c UPROPS_BLOCK_MASK
761 */
762 private static final int BLOCK_MASK_ = 0x00007f80;
763 /**
764 * Integer properties mask and shift values for blocks.
765 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
766 */
767 private static final int BLOCK_SHIFT_ = 7;
768 /**
769 * Integer properties mask and shift values for scripts.
770 * Equivalent to icu4c UPROPS_SHIFT_MASK
771 */
772 private static final int SCRIPT_MASK_ = 0x0000007f;
773
774 // private constructor -----------------------------------------------
///CLOVER:OFF
/**
 * Not instantiable: this class only exposes static members.
 */
private UCharacter() {
    // intentionally empty
}
///CLOVER:ON
783 // private methods ---------------------------------------------------
784
785 /**
786 * Getting the digit values of characters like 'A' - 'Z', normal,
787 * half-width and full-width. This method assumes that the other digit
788 * characters are checked by the calling method.
789 * @param ch character to test
790 * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise
791 * its corresponding digit will be returned.
792 */
793 private static int getEuropeanDigit(int ch) {
794 if ((ch > 0x7a && ch < 0xff21)
795 || ch < 0x41 || (ch > 0x5a && ch < 0x61)
796 || ch > 0xff5a || (ch > 0xff31 && ch < 0xff41)) {
797 return -1;
798 }
799 if (ch <= 0x7a) {
800 // ch >= 0x41 or ch < 0x61
801 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61);
802 }
803 // ch >= 0xff21
804 if (ch <= 0xff3a) {
805 return ch + 10 - 0xff21;
806 }
807 // ch >= 0xff41 && ch <= 0xff5a
808 return ch + 10 - 0xff41;
809 }
810
811 /**
812 * Gets the numeric type of the property argument
813 * @param props 32 bit property
814 * @return the numeric type
815 */
816 private static int getNumericType(int props)
817 {
818 return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_;
819 }
820
821 /**
822 * Checks if the property value has a exception indicator
823 * @param props 32 bit property value
824 * @return true if property does not have a exception indicator, false
825 * otherwise
826 */
827 private static boolean isNotExceptionIndicator(int props)
828 {
829 return (props & UCharacterProperty.EXCEPTION_MASK) == 0;
830 }
831
832 /**
833 * Gets the property value at the index.
834 * This is optimized.
835 * Note this is alittle different from CharTrie the index m_trieData_
836 * is never negative.
837 * This is a duplicate of UCharacterProperty.getProperty. For optimization
838 * purposes, this method calls the trie data directly instead of through
839 * UCharacterProperty.getProperty.
840 * @param ch code point whose property value is to be retrieved
841 * @return property value of code point
842 * @stable ICU 2.6
843 */
844 private static int getProperty(int ch)
845 {
846 if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
847 || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
848 && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
849 // BMP codepoint
850 try { // using try for < 0 ch is faster than using an if statement
851 return PROPERTY_DATA_[
852 PROPERTY_TRIE_DATA_[
853 (PROPERTY_TRIE_INDEX_[ch >> 5] << 2)
854 + (ch & 0x1f)]];
855 } catch (ArrayIndexOutOfBoundsException e) {
856 return PROPERTY_INITIAL_VALUE_;
857 }
858 }
859 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
860 // surrogate
861 return PROPERTY_DATA_[
862 PROPERTY_TRIE_DATA_[
863 (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2)
864 + (ch & 0x1f)]];
865 }
866 // for optimization
867 if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
868 // look at the construction of supplementary characters
869 // trail forms the ends of it.
870 return PROPERTY_DATA_[PROPERTY_.m_trie_.getSurrogateValue(
871 UTF16.getLeadSurrogate(ch),
872 (char)(ch & 0x3ff))];
873 }
874 // return m_dataOffset_ if there is an error, in this case we return
875 // the default value: m_initialValue_
876 // we cannot assume that m_initialValue_ is at offset 0
877 // this is for optimization.
878 return PROPERTY_INITIAL_VALUE_;
879 }
880}