J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | /* |
| 26 | ******************************************************************************* |
| 27 | * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * |
| 28 | * * |
| 29 | * The original version of this source code and documentation is copyrighted * |
| 30 | * and owned by IBM, These materials are provided under terms of a License * |
| 31 | * Agreement between IBM and Sun. This technology is protected by multiple * |
| 32 | * US and International patents. This notice and attribution to IBM may not * |
| 33 | * to removed. * |
| 34 | ******************************************************************************* |
| 35 | */ |
| 36 | |
| 37 | package sun.text.normalizer; |
| 38 | |
| 39 | import java.lang.ref.SoftReference; |
| 40 | import java.util.HashMap; |
| 41 | import java.util.Locale; |
| 42 | import java.util.Map; |
| 43 | |
| 44 | /** |
| 45 | * <p> |
| 46 | * The UCharacter class provides extensions to the |
| 47 | * <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html> |
| 48 | * java.lang.Character</a> class. These extensions provide support for |
| 49 | * Unicode 3.2 properties and together with the <a href=../text/UTF16.html>UTF16</a> |
| 50 | * class, provide support for supplementary characters (those with code |
| 51 | * points above U+FFFF). |
| 52 | * </p> |
| 53 | * <p> |
| 54 | * Code points are represented in these API using ints. While it would be |
| 55 | * more convenient in Java to have a separate primitive datatype for them, |
| 56 | * ints suffice in the meantime. |
| 57 | * </p> |
| 58 | * <p> |
| 59 | * To use this class please add the jar file name icu4j.jar to the |
| 60 | * class path, since it contains data files which supply the information used |
| 61 | * by this file.<br> |
| 62 | * E.g. In Windows <br> |
| 63 | * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br> |
| 64 | * Otherwise, another method would be to copy the files uprops.dat and |
| 65 | * unames.icu from the icu4j source subdirectory |
| 66 | * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory |
| 67 | * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>. |
| 68 | * </p> |
| 69 | * <p> |
| 70 | * Aside from the additions for UTF-16 support, and the updated Unicode 3.1 |
| 71 | * properties, the main differences between UCharacter and Character are: |
| 72 | * <ul> |
 * <li> UCharacter is not designed to be a char wrapper and does not have
 *      APIs that involve the management of a single char.<br>
| 75 | * These include: |
| 76 | * <ul> |
| 77 | * <li> char charValue(), |
| 78 | * <li> int compareTo(java.lang.Character, java.lang.Character), etc. |
| 79 | * </ul> |
 * <li> UCharacter does not include Character APIs that are deprecated, nor
 *      does it include the Java-specific character information, such as
 *      boolean isJavaIdentifierPart(char ch).
| 83 | * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric |
| 84 | * values '10' - '35'. UCharacter also does this in digit and |
| 85 | * getNumericValue, to adhere to the java semantics of these |
| 86 | * methods. New methods unicodeDigit, and |
| 87 | * getUnicodeNumericValue do not treat the above code points |
| 88 | * as having numeric values. This is a semantic change from ICU4J 1.3.1. |
| 89 | * </ul> |
| 90 | * <p> |
| 91 | * Further detail differences can be determined from the program |
| 92 | * <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java> |
| 93 | * com.ibm.icu.dev.test.lang.UCharacterCompare</a> |
| 94 | * </p> |
| 95 | * <p> |
| 96 | * This class is not subclassable |
| 97 | * </p> |
| 98 | * @author Syn Wee Quek |
| 99 | * @stable ICU 2.1 |
| 100 | * @see com.ibm.icu.lang.UCharacterEnums |
| 101 | */ |
| 102 | |
| 103 | public final class UCharacter |
| 104 | { |
| 105 | |
| 106 | /** |
| 107 | * Numeric Type constants. |
| 108 | * @see UProperty#NUMERIC_TYPE |
| 109 | * @stable ICU 2.4 |
| 110 | */ |
| 111 | public static interface NumericType |
| 112 | { |
| 113 | /** |
| 114 | * @stable ICU 2.4 |
| 115 | */ |
| 116 | public static final int NONE = 0; |
| 117 | /** |
| 118 | * @stable ICU 2.4 |
| 119 | */ |
| 120 | public static final int DECIMAL = 1; |
| 121 | /** |
| 122 | * @stable ICU 2.4 |
| 123 | */ |
| 124 | public static final int DIGIT = 2; |
| 125 | /** |
| 126 | * @stable ICU 2.4 |
| 127 | */ |
| 128 | public static final int NUMERIC = 3; |
| 129 | /** |
| 130 | * @stable ICU 2.4 |
| 131 | */ |
| 132 | public static final int COUNT = 4; |
| 133 | } |
| 134 | |
| 135 | /** |
| 136 | * Hangul Syllable Type constants. |
| 137 | * |
| 138 | * @see UProperty#HANGUL_SYLLABLE_TYPE |
| 139 | * @stable ICU 2.6 |
| 140 | */ |
| 141 | public static interface HangulSyllableType |
| 142 | { |
| 143 | /** |
| 144 | * @stable ICU 2.6 |
| 145 | */ |
| 146 | public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/ |
| 147 | /** |
| 148 | * @stable ICU 2.6 |
| 149 | */ |
| 150 | public static final int LEADING_JAMO = 1; /*[L]*/ |
| 151 | /** |
| 152 | * @stable ICU 2.6 |
| 153 | */ |
| 154 | public static final int VOWEL_JAMO = 2; /*[V]*/ |
| 155 | /** |
| 156 | * @stable ICU 2.6 |
| 157 | */ |
| 158 | public static final int TRAILING_JAMO = 3; /*[T]*/ |
| 159 | /** |
| 160 | * @stable ICU 2.6 |
| 161 | */ |
| 162 | public static final int LV_SYLLABLE = 4; /*[LV]*/ |
| 163 | /** |
| 164 | * @stable ICU 2.6 |
| 165 | */ |
| 166 | public static final int LVT_SYLLABLE = 5; /*[LVT]*/ |
| 167 | /** |
| 168 | * @stable ICU 2.6 |
| 169 | */ |
| 170 | public static final int COUNT = 6; |
| 171 | } |
| 172 | |
| 173 | /** |
| 174 | * [Sun] This interface moved from UCharacterEnums.java. |
| 175 | * |
| 176 | * 'Enum' for the CharacterCategory constants. These constants are |
| 177 | * compatible in name <b>but not in value</b> with those defined in |
| 178 | * <code>java.lang.Character</code>. |
| 179 | * @see UCharacterCategory |
| 180 | * @draft ICU 3.0 |
| 181 | * @deprecated This is a draft API and might change in a future release of ICU. |
| 182 | */ |
| 183 | public static interface ECharacterCategory |
| 184 | { |
| 185 | /** |
| 186 | * Character type Lu |
| 187 | * @stable ICU 2.1 |
| 188 | */ |
| 189 | public static final int UPPERCASE_LETTER = 1; |
| 190 | |
| 191 | /** |
| 192 | * Character type Lt |
| 193 | * @stable ICU 2.1 |
| 194 | */ |
| 195 | public static final int TITLECASE_LETTER = 3; |
| 196 | |
| 197 | /** |
| 198 | * Character type Lo |
| 199 | * @stable ICU 2.1 |
| 200 | */ |
| 201 | public static final int OTHER_LETTER = 5; |
| 202 | } |
| 203 | |
| 204 | // public data members ----------------------------------------------- |
| 205 | |
/**
 * The lowest Unicode code point value.
 * Mirrors UTF16.CODEPOINT_MIN_VALUE.
 * @stable ICU 2.1
 */
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;

/**
 * The highest Unicode code point value (scalar value) according to the
 * Unicode Standard.
 * This is a 21-bit value (21 bits, rounded up).<br>
 * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE.
 * Mirrors UTF16.CODEPOINT_MAX_VALUE.
 * @stable ICU 2.1
 */
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;

/**
 * The minimum value for Supplementary code points.
 * Mirrors UTF16.SUPPLEMENTARY_MIN_VALUE.
 * @stable ICU 2.1
 */
public static final int SUPPLEMENTARY_MIN_VALUE =
    UTF16.SUPPLEMENTARY_MIN_VALUE;

/**
 * Special value that is returned by getUnicodeNumericValue(int) when no
 * numeric value is defined for a code point.
 * @stable ICU 2.4
 * @see #getUnicodeNumericValue
 */
public static final double NO_NUMERIC_VALUE = -123456789;
| 235 | |
| 236 | // public methods ---------------------------------------------------- |
| 237 | |
| 238 | /** |
| 239 | * Retrieves the numeric value of a decimal digit code point. |
| 240 | * <br>This method observes the semantics of |
| 241 | * <code>java.lang.Character.digit()</code>. Note that this |
| 242 | * will return positive values for code points for which isDigit |
| 243 | * returns false, just like java.lang.Character. |
| 244 | * <br><em>Semantic Change:</em> In release 1.3.1 and |
| 245 | * prior, this did not treat the European letters as having a |
| 246 | * digit value, and also treated numeric letters and other numbers as |
| 247 | * digits. |
| 248 | * This has been changed to conform to the java semantics. |
| 249 | * <br>A code point is a valid digit if and only if: |
| 250 | * <ul> |
| 251 | * <li>ch is a decimal digit or one of the european letters, and |
| 252 | * <li>the value of ch is less than the specified radix. |
| 253 | * </ul> |
| 254 | * @param ch the code point to query |
| 255 | * @param radix the radix |
| 256 | * @return the numeric value represented by the code point in the |
| 257 | * specified radix, or -1 if the code point is not a decimal digit |
| 258 | * or if its value is too large for the radix |
| 259 | * @stable ICU 2.1 |
| 260 | */ |
| 261 | public static int digit(int ch, int radix) |
| 262 | { |
| 263 | // when ch is out of bounds getProperty == 0 |
| 264 | int props = getProperty(ch); |
| 265 | if (getNumericType(props) != NumericType.DECIMAL) { |
| 266 | return (radix <= 10) ? -1 : getEuropeanDigit(ch); |
| 267 | } |
| 268 | // if props == 0, it will just fall through and return -1 |
| 269 | if (isNotExceptionIndicator(props)) { |
| 270 | // not contained in exception data |
| 271 | // getSignedValue is just shifting so we can check for the sign |
| 272 | // first |
| 273 | // Optimization |
| 274 | // int result = UCharacterProperty.getSignedValue(props); |
| 275 | // if (result >= 0) { |
| 276 | // return result; |
| 277 | // } |
| 278 | if (props >= 0) { |
| 279 | return UCharacterProperty.getSignedValue(props); |
| 280 | } |
| 281 | } |
| 282 | else { |
| 283 | int index = UCharacterProperty.getExceptionIndex(props); |
| 284 | if (PROPERTY_.hasExceptionValue(index, |
| 285 | UCharacterProperty.EXC_NUMERIC_VALUE_)) { |
| 286 | int result = PROPERTY_.getException(index, |
| 287 | UCharacterProperty.EXC_NUMERIC_VALUE_); |
| 288 | if (result >= 0) { |
| 289 | return result; |
| 290 | } |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | if (radix > 10) { |
| 295 | int result = getEuropeanDigit(ch); |
| 296 | if (result >= 0 && result < radix) { |
| 297 | return result; |
| 298 | } |
| 299 | } |
| 300 | return -1; |
| 301 | } |
| 302 | |
| 303 | /** |
| 304 | * <p>Get the numeric value for a Unicode code point as defined in the |
| 305 | * Unicode Character Database.</p> |
| 306 | * <p>A "double" return type is necessary because some numeric values are |
| 307 | * fractions, negative, or too large for int.</p> |
| 308 | * <p>For characters without any numeric values in the Unicode Character |
| 309 | * Database, this function will return NO_NUMERIC_VALUE.</p> |
| 310 | * <p><em>API Change:</em> In release 2.2 and prior, this API has a |
| 311 | * return type int and returns -1 when the argument ch does not have a |
| 312 | * corresponding numeric value. This has been changed to synch with ICU4C |
| 313 | * </p> |
| 314 | * This corresponds to the ICU4C function u_getNumericValue. |
| 315 | * @param ch Code point to get the numeric value for. |
| 316 | * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined. |
| 317 | * @stable ICU 2.4 |
| 318 | */ |
| 319 | public static double getUnicodeNumericValue(int ch) |
| 320 | { |
| 321 | // equivalent to c version double u_getNumericValue(UChar32 c) |
| 322 | int props = PROPERTY_.getProperty(ch); |
| 323 | int numericType = getNumericType(props); |
| 324 | if (numericType > NumericType.NONE && numericType < NumericType.COUNT) { |
| 325 | if (isNotExceptionIndicator(props)) { |
| 326 | return UCharacterProperty.getSignedValue(props); |
| 327 | } |
| 328 | else { |
| 329 | int index = UCharacterProperty.getExceptionIndex(props); |
| 330 | boolean nex = false; |
| 331 | boolean dex = false; |
| 332 | double numerator = 0; |
| 333 | if (PROPERTY_.hasExceptionValue(index, |
| 334 | UCharacterProperty.EXC_NUMERIC_VALUE_)) { |
| 335 | int num = PROPERTY_.getException(index, |
| 336 | UCharacterProperty.EXC_NUMERIC_VALUE_); |
| 337 | // There are special values for huge numbers that are |
| 338 | // powers of ten. genprops/store.c documents: |
| 339 | // if numericValue = 0x7fffff00 + x then |
| 340 | // numericValue = 10 ^ x |
| 341 | if (num >= NUMERATOR_POWER_LIMIT_) { |
| 342 | num &= 0xff; |
| 343 | // 10^x without math.h |
| 344 | numerator = Math.pow(10, num); |
| 345 | } |
| 346 | else { |
| 347 | numerator = num; |
| 348 | } |
| 349 | nex = true; |
| 350 | } |
| 351 | double denominator = 0; |
| 352 | if (PROPERTY_.hasExceptionValue(index, |
| 353 | UCharacterProperty.EXC_DENOMINATOR_VALUE_)) { |
| 354 | denominator = PROPERTY_.getException(index, |
| 355 | UCharacterProperty.EXC_DENOMINATOR_VALUE_); |
| 356 | // faster path not in c |
| 357 | if (numerator != 0) { |
| 358 | return numerator / denominator; |
| 359 | } |
| 360 | dex = true; |
| 361 | } |
| 362 | |
| 363 | if (nex) { |
| 364 | if (dex) { |
| 365 | return numerator / denominator; |
| 366 | } |
| 367 | return numerator; |
| 368 | } |
| 369 | if (dex) { |
| 370 | return 1 / denominator; |
| 371 | } |
| 372 | } |
| 373 | } |
| 374 | return NO_NUMERIC_VALUE; |
| 375 | } |
| 376 | |
| 377 | /** |
| 378 | * Returns a value indicating a code point's Unicode category. |
| 379 | * Up-to-date Unicode implementation of java.lang.Character.getType() |
| 380 | * except for the above mentioned code points that had their category |
| 381 | * changed.<br> |
| 382 | * Return results are constants from the interface |
| 383 | * <a href=UCharacterCategory.html>UCharacterCategory</a><br> |
| 384 | * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with |
| 385 | * those returned by java.lang.Character.getType. UCharacterCategory values |
| 386 | * match the ones used in ICU4C, while java.lang.Character type |
| 387 | * values, though similar, skip the value 17.</p> |
| 388 | * @param ch code point whose type is to be determined |
| 389 | * @return category which is a value of UCharacterCategory |
| 390 | * @stable ICU 2.1 |
| 391 | */ |
| 392 | public static int getType(int ch) |
| 393 | { |
| 394 | return getProperty(ch) & UCharacterProperty.TYPE_MASK; |
| 395 | } |
| 396 | |
| 397 | //// for StringPrep |
| 398 | /** |
| 399 | * Returns a code point corresponding to the two UTF16 characters. |
| 400 | * @param lead the lead char |
| 401 | * @param trail the trail char |
| 402 | * @return code point if surrogate characters are valid. |
| 403 | * @exception IllegalArgumentException thrown when argument characters do |
| 404 | * not form a valid codepoint |
| 405 | * @stable ICU 2.1 |
| 406 | */ |
| 407 | public static int getCodePoint(char lead, char trail) |
| 408 | { |
| 409 | if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE && |
| 410 | lead <= UTF16.LEAD_SURROGATE_MAX_VALUE && |
| 411 | trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE && |
| 412 | trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { |
| 413 | return UCharacterProperty.getRawSupplementary(lead, trail); |
| 414 | } |
| 415 | throw new IllegalArgumentException("Illegal surrogate characters"); |
| 416 | } |
| 417 | |
| 418 | //// for StringPrep |
| 419 | /** |
| 420 | * Returns the Bidirection property of a code point. |
| 421 | * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional |
| 422 | * property.<br> |
| 423 | * Result returned belongs to the interface |
| 424 | * <a href=UCharacterDirection.html>UCharacterDirection</a> |
| 425 | * @param ch the code point to be determined its direction |
| 426 | * @return direction constant from UCharacterDirection. |
| 427 | * @stable ICU 2.1 |
| 428 | */ |
| 429 | public static int getDirection(int ch) |
| 430 | { |
| 431 | // when ch is out of bounds getProperty == 0 |
| 432 | return (getProperty(ch) >> BIDI_SHIFT_) & BIDI_MASK_AFTER_SHIFT_; |
| 433 | } |
| 434 | |
| 435 | /** |
| 436 | * The given string is mapped to its case folding equivalent according to |
| 437 | * UnicodeData.txt and CaseFolding.txt; if any character has no case |
| 438 | * folding equivalent, the character itself is returned. |
| 439 | * "Full", multiple-code point case folding mappings are returned here. |
| 440 | * For "simple" single-code point mappings use the API |
| 441 | * foldCase(int ch, boolean defaultmapping). |
| 442 | * @param str the String to be converted |
| 443 | * @param defaultmapping Indicates if all mappings defined in |
| 444 | * CaseFolding.txt is to be used, otherwise the |
| 445 | * mappings for dotted I and dotless i marked with |
| 446 | * 'I' in CaseFolding.txt will be skipped. |
| 447 | * @return the case folding equivalent of the character, if |
| 448 | * any; otherwise the character itself. |
| 449 | * @see #foldCase(int, boolean) |
| 450 | * @stable ICU 2.1 |
| 451 | */ |
| 452 | public static String foldCase(String str, boolean defaultmapping) |
| 453 | { |
| 454 | int size = str.length(); |
| 455 | StringBuffer result = new StringBuffer(size); |
| 456 | int offset = 0; |
| 457 | int ch; |
| 458 | |
| 459 | // case mapping loop |
| 460 | while (offset < size) { |
| 461 | ch = UTF16.charAt(str, offset); |
| 462 | offset += UTF16.getCharCount(ch); |
| 463 | int props = PROPERTY_.getProperty(ch); |
| 464 | if (isNotExceptionIndicator(props)) { |
| 465 | int type = UCharacterProperty.TYPE_MASK & props; |
| 466 | if (type == ECharacterCategory.UPPERCASE_LETTER || |
| 467 | type == ECharacterCategory.TITLECASE_LETTER) { |
| 468 | ch += UCharacterProperty.getSignedValue(props); |
| 469 | } |
| 470 | } |
| 471 | else { |
| 472 | int index = UCharacterProperty.getExceptionIndex(props); |
| 473 | if (PROPERTY_.hasExceptionValue(index, |
| 474 | UCharacterProperty.EXC_CASE_FOLDING_)) { |
| 475 | int exception = PROPERTY_.getException(index, |
| 476 | UCharacterProperty.EXC_CASE_FOLDING_); |
| 477 | if (exception != 0) { |
| 478 | PROPERTY_.getFoldCase(exception & LAST_CHAR_MASK_, |
| 479 | exception >> SHIFT_24_, result); |
| 480 | } |
| 481 | else { |
| 482 | // special case folding mappings, hardcoded |
| 483 | if (ch != 0x49 && ch != 0x130) { |
| 484 | // return ch itself because there is no special |
| 485 | // mapping for it |
| 486 | UTF16.append(result, ch); |
| 487 | continue; |
| 488 | } |
| 489 | if (defaultmapping) { |
| 490 | // default mappings |
| 491 | if (ch == 0x49) { |
| 492 | // 0049; C; 0069; # LATIN CAPITAL LETTER I |
| 493 | result.append( |
| 494 | UCharacterProperty.LATIN_SMALL_LETTER_I_); |
| 495 | } |
| 496 | else if (ch == 0x130) { |
| 497 | // 0130; F; 0069 0307; |
| 498 | // # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| 499 | result.append( |
| 500 | UCharacterProperty.LATIN_SMALL_LETTER_I_); |
| 501 | result.append((char)0x307); |
| 502 | } |
| 503 | } |
| 504 | else { |
| 505 | // Turkic mappings |
| 506 | if (ch == 0x49) { |
| 507 | // 0049; T; 0131; # LATIN CAPITAL LETTER I |
| 508 | result.append((char)0x131); |
| 509 | } |
| 510 | else if (ch == 0x130) { |
| 511 | // 0130; T; 0069; |
| 512 | // # LATIN CAPITAL LETTER I WITH DOT ABOVE |
| 513 | result.append( |
| 514 | UCharacterProperty.LATIN_SMALL_LETTER_I_); |
| 515 | } |
| 516 | } |
| 517 | } |
| 518 | // do not fall through to the output of c |
| 519 | continue; |
| 520 | } |
| 521 | else { |
| 522 | if (PROPERTY_.hasExceptionValue(index, |
| 523 | UCharacterProperty.EXC_LOWERCASE_)) { |
| 524 | ch = PROPERTY_.getException(index, |
| 525 | UCharacterProperty.EXC_LOWERCASE_); |
| 526 | } |
| 527 | } |
| 528 | |
| 529 | } |
| 530 | |
| 531 | // handle 1:1 code point mappings from UnicodeData.txt |
| 532 | UTF16.append(result, ch); |
| 533 | } |
| 534 | |
| 535 | return result.toString(); |
| 536 | } |
| 537 | |
| 538 | /** |
| 539 | * <p>Get the "age" of the code point.</p> |
| 540 | * <p>The "age" is the Unicode version when the code point was first |
| 541 | * designated (as a non-character or for Private Use) or assigned a |
| 542 | * character. |
| 543 | * <p>This can be useful to avoid emitting code points to receiving |
| 544 | * processes that do not accept newer characters.</p> |
| 545 | * <p>The data is from the UCD file DerivedAge.txt.</p> |
| 546 | * @param ch The code point. |
| 547 | * @return the Unicode version number |
| 548 | * @stable ICU 2.6 |
| 549 | */ |
| 550 | public static VersionInfo getAge(int ch) |
| 551 | { |
| 552 | if (ch < MIN_VALUE || ch > MAX_VALUE) { |
| 553 | throw new IllegalArgumentException("Codepoint out of bounds"); |
| 554 | } |
| 555 | return PROPERTY_.getAge(ch); |
| 556 | } |
| 557 | |
| 558 | /** |
| 559 | * <p>Gets the property value for an Unicode property type of a code point. |
| 560 | * Also returns binary and mask property values.</p> |
| 561 | * <p>Unicode, especially in version 3.2, defines many more properties than |
| 562 | * the original set in UnicodeData.txt.</p> |
| 563 | * <p>The properties APIs are intended to reflect Unicode properties as |
| 564 | * defined in the Unicode Character Database (UCD) and Unicode Technical |
| 565 | * Reports (UTR). For details about the properties see |
| 566 | * http://www.unicode.org/.</p> |
| 567 | * <p>For names of Unicode properties see the UCD file PropertyAliases.txt. |
| 568 | * </p> |
| 569 | * <pre> |
| 570 | * Sample usage: |
| 571 | * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH); |
| 572 | * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC); |
| 573 | * boolean b = (ideo == 1) ? true : false; |
| 574 | * </pre> |
| 575 | * @param ch code point to test. |
| 576 | * @param type UProperty selector constant, identifies which binary |
| 577 | * property to check. Must be |
| 578 | * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or |
| 579 | * UProperty.INT_START <= type < UProperty.INT_LIMIT or |
| 580 | * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. |
| 581 | * @return numeric value that is directly the property value or, |
| 582 | * for enumerated properties, corresponds to the numeric value of |
| 583 | * the enumerated constant of the respective property value |
| 584 | * enumeration type (cast to enum type if necessary). |
| 585 | * Returns 0 or 1 (for false / true) for binary Unicode properties. |
| 586 | * Returns a bit-mask for mask properties. |
| 587 | * Returns 0 if 'type' is out of bounds or if the Unicode version |
| 588 | * does not have data for the property at all, or not for this code |
| 589 | * point. |
| 590 | * @see UProperty |
| 591 | * @see #hasBinaryProperty |
| 592 | * @see #getIntPropertyMinValue |
| 593 | * @see #getIntPropertyMaxValue |
| 594 | * @see #getUnicodeVersion |
| 595 | * @stable ICU 2.4 |
| 596 | */ |
| 597 | public static int getIntPropertyValue(int ch, int type) |
| 598 | { |
| 599 | /* |
| 600 | * For Normalizer with Unicode 3.2, this method is called only for |
| 601 | * HANGUL_SYLLABLE_TYPE in UnicodeSet.addPropertyStarts(). |
| 602 | */ |
| 603 | if (type == UProperty.HANGUL_SYLLABLE_TYPE) { |
| 604 | /* purely algorithmic; hardcode known characters, check for assigned new ones */ |
| 605 | if(ch<NormalizerImpl.JAMO_L_BASE) { |
| 606 | /* NA */ |
| 607 | } else if(ch<=0x11ff) { |
| 608 | /* Jamo range */ |
| 609 | if(ch<=0x115f) { |
| 610 | /* Jamo L range, HANGUL CHOSEONG ... */ |
| 611 | if(ch==0x115f || ch<=0x1159 || getType(ch)==ECharacterCategory.OTHER_LETTER) { |
| 612 | return HangulSyllableType.LEADING_JAMO; |
| 613 | } |
| 614 | } else if(ch<=0x11a7) { |
| 615 | /* Jamo V range, HANGUL JUNGSEONG ... */ |
| 616 | if(ch<=0x11a2 || getType(ch)==ECharacterCategory.OTHER_LETTER) { |
| 617 | return HangulSyllableType.VOWEL_JAMO; |
| 618 | } |
| 619 | } else { |
| 620 | /* Jamo T range */ |
| 621 | if(ch<=0x11f9 || getType(ch)==ECharacterCategory.OTHER_LETTER) { |
| 622 | return HangulSyllableType.TRAILING_JAMO; |
| 623 | } |
| 624 | } |
| 625 | } else if((ch-=NormalizerImpl.HANGUL_BASE)<0) { |
| 626 | /* NA */ |
| 627 | } else if(ch<NormalizerImpl.HANGUL_COUNT) { |
| 628 | /* Hangul syllable */ |
| 629 | return ch%NormalizerImpl.JAMO_T_COUNT==0 ? HangulSyllableType.LV_SYLLABLE : HangulSyllableType.LVT_SYLLABLE; |
| 630 | } |
| 631 | } |
| 632 | return 0; /* NA */ |
| 633 | } |
| 634 | |
| 635 | // private variables ------------------------------------------------- |
| 636 | |
/**
 * Database storing the sets of character property
 */
private static final UCharacterProperty PROPERTY_;
/**
 * Direct references to the trie index, trie data and property arrays of
 * PROPERTY_, cached for optimization so that getProperty(int) can index
 * them without going through UCharacterProperty.
 */
private static final char[] PROPERTY_TRIE_INDEX_;
private static final char[] PROPERTY_TRIE_DATA_;
private static final int[] PROPERTY_DATA_;
/**
 * Property word found at the trie's initial value offset; returned by
 * getProperty(int) when a code point cannot be looked up.
 */
private static final int PROPERTY_INITIAL_VALUE_;

// block to initialise character property database
static
{
    try
    {
        PROPERTY_ = UCharacterProperty.getInstance();
        PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_;
        PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_;
        PROPERTY_DATA_ = PROPERTY_.m_property_;
        PROPERTY_INITIAL_VALUE_
            = PROPERTY_DATA_[PROPERTY_.m_trieInitialValue_];
    }
    catch (Exception e)
    {
        // keep the original exception as the cause so that the reason
        // for a failed class initialisation (and its stack trace) is
        // not lost
        throw new RuntimeException(e.getMessage(), e);
    }
}
| 666 | |
/**
 * To get the last character (low 16 bits) out from a data type
 */
private static final int LAST_CHAR_MASK_ = 0xFFFF;

// Disabled: mask to get the last byte out from a data type
// private static final int LAST_BYTE_MASK_ = 0xFF;

// Disabled: shift of 16 bits
// private static final int SHIFT_16_ = 16;

/**
 * Shift 24 bits
 */
private static final int SHIFT_24_ = 24;

/**
 * Shift to get numeric type
 */
private static final int NUMERIC_TYPE_SHIFT_ = 12;
/**
 * Mask to get numeric type (three bits, applied before shifting)
 */
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
/**
 * Shift to get bidi bits
 */
private static final int BIDI_SHIFT_ = 6;

/**
 * Mask to be applied after shifting to get bidi bits
 */
private static final int BIDI_MASK_AFTER_SHIFT_ = 0x1F;

/**
 * <p>Numerator power limit.
 * There are special values for huge numbers that are powers of ten.</p>
 * <p>c version genprops/store.c documents:
 * if numericValue = 0x7fffff00 + x then numericValue = 10 ^ x</p>
 */
private static final int NUMERATOR_POWER_LIMIT_ = 0x7fffff00;

// NOTE(review): the masks and shifts below are not referenced by any
// method in this part of the file; they appear to be kept for parity
// with icu4c - confirm before removing.

/**
 * Integer properties mask for joining type.
 * Equivalent to icu4c UPROPS_JT_MASK.
 */
private static final int JOINING_TYPE_MASK_ = 0x00003800;
/**
 * Integer properties shift for joining type.
 * Equivalent to icu4c UPROPS_JT_SHIFT.
 */
private static final int JOINING_TYPE_SHIFT_ = 11;
/**
 * Integer properties mask for joining group.
 * Equivalent to icu4c UPROPS_JG_MASK.
 */
private static final int JOINING_GROUP_MASK_ = 0x000007e0;
/**
 * Integer properties shift for joining group.
 * Equivalent to icu4c UPROPS_JG_SHIFT.
 */
private static final int JOINING_GROUP_SHIFT_ = 5;
/**
 * Integer properties mask for decomposition type.
 * Equivalent to icu4c UPROPS_DT_MASK.
 */
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/**
 * Integer properties mask for East Asian cell width.
 * Equivalent to icu4c UPROPS_EA_MASK
 */
private static final int EAST_ASIAN_MASK_ = 0x00038000;
/**
 * Integer properties shift for East Asian cell width.
 * Equivalent to icu4c UPROPS_EA_SHIFT
 */
private static final int EAST_ASIAN_SHIFT_ = 15;

/**
 * Integer properties mask for line breaks.
 * Equivalent to icu4c UPROPS_LB_MASK
 */
private static final int LINE_BREAK_MASK_ = 0x007C0000;
/**
 * Integer properties shift for line breaks.
 * Equivalent to icu4c UPROPS_LB_SHIFT
 */
private static final int LINE_BREAK_SHIFT_ = 18;
/**
 * Integer properties mask for blocks.
 * Equivalent to icu4c UPROPS_BLOCK_MASK
 */
private static final int BLOCK_MASK_ = 0x00007f80;
/**
 * Integer properties shift for blocks.
 * Equivalent to icu4c UPROPS_BLOCK_SHIFT
 */
private static final int BLOCK_SHIFT_ = 7;
/**
 * Integer properties mask for scripts.
 * Equivalent to icu4c UPROPS_SHIFT_MASK
 */
private static final int SCRIPT_MASK_ = 0x0000007f;
| 773 | |
| 774 | // private constructor ----------------------------------------------- |
| 775 | ///CLOVER:OFF |
/**
 * Not instantiable: the members visible in this class are all static,
 * so the only constructor is private and empty.
 */
private UCharacter() {}
| 782 | ///CLOVER:ON |
| 783 | // private methods --------------------------------------------------- |
| 784 | |
| 785 | /** |
| 786 | * Getting the digit values of characters like 'A' - 'Z', normal, |
| 787 | * half-width and full-width. This method assumes that the other digit |
| 788 | * characters are checked by the calling method. |
| 789 | * @param ch character to test |
| 790 | * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise |
| 791 | * its corresponding digit will be returned. |
| 792 | */ |
| 793 | private static int getEuropeanDigit(int ch) { |
| 794 | if ((ch > 0x7a && ch < 0xff21) |
| 795 | || ch < 0x41 || (ch > 0x5a && ch < 0x61) |
| 796 | || ch > 0xff5a || (ch > 0xff31 && ch < 0xff41)) { |
| 797 | return -1; |
| 798 | } |
| 799 | if (ch <= 0x7a) { |
| 800 | // ch >= 0x41 or ch < 0x61 |
| 801 | return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); |
| 802 | } |
| 803 | // ch >= 0xff21 |
| 804 | if (ch <= 0xff3a) { |
| 805 | return ch + 10 - 0xff21; |
| 806 | } |
| 807 | // ch >= 0xff41 && ch <= 0xff5a |
| 808 | return ch + 10 - 0xff41; |
| 809 | } |
| 810 | |
| 811 | /** |
| 812 | * Gets the numeric type of the property argument |
| 813 | * @param props 32 bit property |
| 814 | * @return the numeric type |
| 815 | */ |
| 816 | private static int getNumericType(int props) |
| 817 | { |
| 818 | return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; |
| 819 | } |
| 820 | |
| 821 | /** |
| 822 | * Checks if the property value has a exception indicator |
| 823 | * @param props 32 bit property value |
| 824 | * @return true if property does not have a exception indicator, false |
| 825 | * otherwise |
| 826 | */ |
| 827 | private static boolean isNotExceptionIndicator(int props) |
| 828 | { |
| 829 | return (props & UCharacterProperty.EXCEPTION_MASK) == 0; |
| 830 | } |
| 831 | |
| 832 | /** |
| 833 | * Gets the property value at the index. |
| 834 | * This is optimized. |
| 835 | * Note this is alittle different from CharTrie the index m_trieData_ |
| 836 | * is never negative. |
| 837 | * This is a duplicate of UCharacterProperty.getProperty. For optimization |
| 838 | * purposes, this method calls the trie data directly instead of through |
| 839 | * UCharacterProperty.getProperty. |
| 840 | * @param ch code point whose property value is to be retrieved |
| 841 | * @return property value of code point |
| 842 | * @stable ICU 2.6 |
| 843 | */ |
| 844 | private static int getProperty(int ch) |
| 845 | { |
| 846 | if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE |
| 847 | || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE |
| 848 | && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { |
| 849 | // BMP codepoint |
| 850 | try { // using try for < 0 ch is faster than using an if statement |
| 851 | return PROPERTY_DATA_[ |
| 852 | PROPERTY_TRIE_DATA_[ |
| 853 | (PROPERTY_TRIE_INDEX_[ch >> 5] << 2) |
| 854 | + (ch & 0x1f)]]; |
| 855 | } catch (ArrayIndexOutOfBoundsException e) { |
| 856 | return PROPERTY_INITIAL_VALUE_; |
| 857 | } |
| 858 | } |
| 859 | if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { |
| 860 | // surrogate |
| 861 | return PROPERTY_DATA_[ |
| 862 | PROPERTY_TRIE_DATA_[ |
| 863 | (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) |
| 864 | + (ch & 0x1f)]]; |
| 865 | } |
| 866 | // for optimization |
| 867 | if (ch <= UTF16.CODEPOINT_MAX_VALUE) { |
| 868 | // look at the construction of supplementary characters |
| 869 | // trail forms the ends of it. |
| 870 | return PROPERTY_DATA_[PROPERTY_.m_trie_.getSurrogateValue( |
| 871 | UTF16.getLeadSurrogate(ch), |
| 872 | (char)(ch & 0x3ff))]; |
| 873 | } |
| 874 | // return m_dataOffset_ if there is an error, in this case we return |
| 875 | // the default value: m_initialValue_ |
| 876 | // we cannot assume that m_initialValue_ is at offset 0 |
| 877 | // this is for optimization. |
| 878 | return PROPERTY_INITIAL_VALUE_; |
| 879 | } |
| 880 | } |