J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Portions Copyright 2005 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | /* |
| 27 | ******************************************************************************* |
| 28 | * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * |
| 29 | * * |
| 30 | * The original version of this source code and documentation is copyrighted * |
| 31 | * and owned by IBM, These materials are provided under terms of a License * |
| 32 | * Agreement between IBM and Sun. This technology is protected by multiple * |
| 33 | * US and International patents. This notice and attribution to IBM may not * |
| 34 | * to removed. * |
| 35 | ******************************************************************************* |
| 36 | */ |
| 37 | |
| 38 | package sun.text.normalizer; |
| 39 | |
| 40 | import java.io.InputStream; |
| 41 | import java.io.DataInputStream; |
| 42 | import java.io.IOException; |
| 43 | |
| 44 | /** |
| 45 | * <p>Internal reader class for ICU data file uprops.icu containing |
| 46 | * Unicode codepoint data.</p> |
| 47 | * <p>This class simply reads uprops.icu, authenticates that it is a valid |
| 48 | * ICU data file and splits its contents up into blocks of data for use in |
| 49 | * <a href=UCharacterProperty.html>com.ibm.icu.impl.UCharacterProperty</a>. |
| 50 | * </p> |
| 51 | * <p>uprops.icu, which is in big-endian format, is jarred together with this |
| 52 | * package.</p> |
| 53 | * @author Syn Wee Quek |
| 54 | * @since release 2.1, February 1st 2002 |
| 55 | * @draft 2.1 |
| 56 | */ |
| 57 | /* Unicode character properties file format ------------------------------------ |
| 58 | |
| 59 | The file format prepared and written here contains several data |
| 60 | structures that store indexes or data. |
| 61 | |
| 62 | |
| 63 | |
| 64 | The following is a description of format version 3 . |
| 65 | |
| 66 | Data contents: |
| 67 | |
| 68 | The contents are a parsed, binary form of several Unicode character |
| 69 | database files, most prominently UnicodeData.txt. |
| 70 | |
| 71 | Any Unicode code point from 0 to 0x10ffff can be looked up to get |
| 72 | the properties, if any, for that code point. This means that the input |
| 73 | to the lookup are 21-bit unsigned integers, with not all of the |
| 74 | 21-bit range used. |
| 75 | |
| 76 | It is assumed that client code keeps a uint32_t pointer |
| 77 | to the beginning of the data: |
| 78 | |
| 79 | const uint32_t *p32; |
| 80 | |
| 81 | Formally, the file contains the following structures: |
| 82 | |
| 83 | const int32_t indexes[16] with values i0..i15: |
| 84 | |
| 85 | i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words |
| 86 | i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words |
| 87 | i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings |
| 88 | |
| 89 | i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties |
| 90 | i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors |
| 91 | i5 additionalVectorsColumns; -- number of 32-bit words per properties vector |
| 92 | |
| 93 | i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table |
| 94 | i7..i9 reservedIndexes; -- reserved values; 0 for now |
| 95 | |
| 96 | i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+) |
| 97 | i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2) |
| 98 | i12..i15 reservedIndexes; -- reserved values; 0 for now |
| 99 | |
| 100 | PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) |
| 101 | |
| 102 | P const uint32_t props32[i1-i0]; |
| 103 | E const uint32_t exceptions[i2-i1]; |
| 104 | U const UChar uchars[2*(i3-i2)]; |
| 105 | |
| 106 | AT serialized trie for additional properties (byte size: 4*(i4-i3)) |
| 107 | PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4]; |
| 108 | |
| 109 | Trie lookup and properties: |
| 110 | |
| 111 | In order to condense the data for the 21-bit code space, several properties of |
| 112 | the Unicode code assignment are exploited: |
| 113 | - The code space is sparse. |
| 114 | - There are several 10k of consecutive codes with the same properties. |
| 115 | - Characters and scripts are allocated in groups of 16 code points. |
| 116 | - Inside blocks for scripts the properties are often repetitive. |
| 117 | - The 21-bit space is not fully used for Unicode. |
| 118 | |
| 119 | The lookup of properties for a given code point is done with a trie lookup, |
| 120 | using the UTrie implementation. |
| 121 | The trie lookup result is a 16-bit index in the props32[] table where the |
| 122 | actual 32-bit properties word is stored. This is done to save space. |
| 123 | |
| 124 | (There are thousands of 16-bit entries in the trie data table, but |
| 125 | only a few hundred unique 32-bit properties words. |
| 126 | If the trie data table contained 32-bit words directly, then that would be |
| 127 | larger because the length of the table would be the same as now but the |
| 128 | width would be 32 bits instead of 16. This saves more than 10kB.) |
| 129 | |
| 130 | With a given Unicode code point |
| 131 | |
| 132 | UChar32 c; |
| 133 | |
| 134 | and 0<=c<0x110000, the lookup is done like this: |
| 135 | |
| 136 | uint16_t i; |
| 137 | UTRIE_GET16(c, i); |
| 138 | uint32_t props=p32[i]; |
| 139 | |
| 140 | For some characters, not all of the properties can be efficiently encoded |
| 141 | using 32 bits. For them, the 32-bit word contains an index into the exceptions[] |
| 142 | array: |
| 143 | |
| 144 | if(props&EXCEPTION_BIT) { |
| 145 | uint16_t e=(uint16_t)(props>>VALUE_SHIFT); |
| 146 | ... |
| 147 | } |
| 148 | |
| 149 | The exception values are a variable number of uint32_t starting at |
| 150 | |
| 151 | const uint32_t *pe=p32+exceptionsIndex+e; |
| 152 | |
| 153 | The first uint32_t there contains flags about what values actually follow it. |
| 154 | Some of the exception values are UChar32 code points for the case mappings, |
| 155 | others are numeric values etc. |
| 156 | |
| 157 | 32-bit properties sets: |
| 158 | |
| 159 | Each 32-bit properties word contains: |
| 160 | |
| 161 | 0.. 4 general category |
| 162 | 5 has exception values |
| 163 | 6..10 BiDi category |
| 164 | 11 is mirrored |
| 165 | 12..14 numericType: |
| 166 | 0 no numeric value |
| 167 | 1 decimal digit value |
| 168 | 2 digit value |
| 169 | 3 numeric value |
| 170 | ### TODO: type 4 for Han digits & numbers?! |
| 171 | 15..19 reserved |
| 172 | 20..31 value according to bits 0..5: |
| 173 | if(has exception) { |
| 174 | exception index; |
| 175 | } else switch(general category) { |
| 176 | case Ll: delta to uppercase; -- same as titlecase |
| 177 | case Lu: -delta to lowercase; -- titlecase is same as c |
| 178 | case Lt: -delta to lowercase; -- uppercase is same as c |
| 179 | default: |
| 180 | if(is mirrored) { |
| 181 | delta to mirror; |
| 182 | } else if(numericType!=0) { |
| 183 | numericValue; |
| 184 | } else { |
| 185 | 0; |
| 186 | }; |
| 187 | } |
| 188 | |
| 189 | Exception values: |
| 190 | |
| 191 | In the first uint32_t exception word for a code point, |
| 192 | bits |
| 193 | 31..16 reserved |
| 194 | 15..0 flags that indicate which values follow: |
| 195 | |
| 196 | bit |
| 197 | 0 has uppercase mapping |
| 198 | 1 has lowercase mapping |
| 199 | 2 has titlecase mapping |
| 200 | 3 unused |
| 201 | 4 has numeric value (numerator) |
| 202 | if numericValue=0x7fffff00+x then numericValue=10^x |
| 203 | 5 has denominator value |
| 204 | 6 has a mirror-image Unicode code point |
| 205 | 7 has SpecialCasing.txt entries |
| 206 | 8 has CaseFolding.txt entries |
| 207 | |
| 208 | According to the flags in this word, one or more uint32_t words follow it |
| 209 | in the sequence of the bit flags in the flags word; if a flag is not set, |
| 210 | then the value is missing or 0: |
| 211 | |
| 212 | For the case mappings and the mirror-image Unicode code point, |
| 213 | one uint32_t or UChar32 each is the code point. |
| 214 | If the titlecase mapping is missing, then it is the same as the uppercase mapping. |
| 215 | |
| 216 | For the digit values, bits 31..16 contain the decimal digit value, and |
| 217 | bits 15..0 contain the digit value. A value of -1 indicates that |
| 218 | this value is missing. |
| 219 | |
| 220 | For the numeric/numerator value, an int32_t word contains the value directly, |
| 221 | except for when there is no numerator but a denominator, then the numerator |
| 222 | is implicitly 1. This means: |
| 223 | numerator denominator result |
| 224 | none none none |
| 225 | x none x |
| 226 | none y 1/y |
| 227 | x y x/y |
| 228 | |
| 229 | If the numerator value is 0x7fffff00+x then it is replaced with 10^x. |
| 230 | |
| 231 | For the denominator value, a uint32_t word contains the value directly. |
| 232 | |
| 233 | For special casing mappings, the 32-bit exception word contains: |
| 234 | 31 if set, this character has complex, conditional mappings |
| 235 | that are not stored; |
| 236 | otherwise, the mappings are stored according to the following bits |
| 237 | 30..24 number of UChars used for mappings |
| 238 | 23..16 reserved |
| 239 | 15.. 0 UChar offset from the beginning of the UChars array where the |
| 240 | UChars for the special case mappings are stored in the following format: |
| 241 | |
| 242 | Format of special casing UChars: |
| 243 | One UChar value with lengths as follows: |
| 244 | 14..10 number of UChars for titlecase mapping |
| 245 | 9.. 5 number of UChars for uppercase mapping |
| 246 | 4.. 0 number of UChars for lowercase mapping |
| 247 | |
| 248 | Followed by the UChars for lowercase, uppercase, titlecase mappings in this order. |
| 249 | |
| 250 | For case folding mappings, the 32-bit exception word contains: |
| 251 | 31..24 number of UChars used for the full mapping |
| 252 | 23..16 reserved |
| 253 | 15.. 0 UChar offset from the beginning of the UChars array where the |
| 254 | UChars for the special case mappings are stored in the following format: |
| 255 | |
| 256 | Format of case folding UChars: |
| 257 | Two UChars contain the simple mapping as follows: |
| 258 | 0, 0 no simple mapping |
| 259 | BMP,0 a simple mapping to a BMP code point |
| 260 | s1, s2 a simple mapping to a supplementary code point stored as two surrogates |
| 261 | This is followed by the UChars for the full case folding mappings. |
| 262 | |
| 263 | Example: |
| 264 | U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase |
| 265 | mapping and a numeric value. |
| 266 | Its exception values would be stored as 3 uint32_t words: |
| 267 | |
| 268 | - flags=0x12 (bit 1: lowercase mapping, bit 4: numeric value; see above) with combining class 0 |
| 269 | - lowercase mapping 0x2170 |
| 270 | - numeric value=1 |
| 271 | |
| 272 | --- Additional properties (new in format version 2.1) --- |
| 273 | |
| 274 | The second trie for additional properties (AT) is also a UTrie with 16-bit data. |
| 275 | The data words consist of 32-bit unit indexes (not row indexes!) into the |
| 276 | table of unique properties vectors (PV). |
| 277 | Each vector contains a set of properties. |
| 278 | The width of a vector (number of uint32_t per row) may change |
| 279 | with the formatVersion, it is stored in i5. |
| 280 | |
| 281 | Current properties: see icu/source/common/uprops.h |
| 282 | |
| 283 | --- Changes in format version 3.1 --- |
| 284 | |
| 285 | See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT. |
| 286 | |
| 287 | --- Changes in format version 3.2 --- |
| 288 | |
| 289 | - The tries use linear Latin-1 ranges. |
| 290 | - The additional properties bits store full properties XYZ instead |
| 291 | of partial Other_XYZ, so that changes in the derivation formulas |
| 292 | need not be tracked in runtime library code. |
| 293 | - Joining Type and Line Break are also stored completely, so that uprops.c |
| 294 | needs no runtime formulas for enumerated properties either. |
| 295 | - Store the case-sensitive flag in the main properties word. |
| 296 | - i10 also contains U_LB_COUNT and U_EA_COUNT. |
| 297 | - i11 contains maxValues2 for vector word 2. |
| 298 | |
| 299 | ----------------------------------------------------------------------------- */ |
| 300 | |
| 301 | final class UCharacterPropertyReader implements ICUBinary.Authenticate |
| 302 | { |
| 303 | // public methods ---------------------------------------------------- |
| 304 | |
| 305 | public boolean isDataVersionAcceptable(byte version[]) |
| 306 | { |
| 307 | return version[0] == DATA_FORMAT_VERSION_[0] |
| 308 | && version[2] == DATA_FORMAT_VERSION_[2] |
| 309 | && version[3] == DATA_FORMAT_VERSION_[3]; |
| 310 | } |
| 311 | |
| 312 | // protected constructor --------------------------------------------- |
| 313 | |
| 314 | /** |
| 315 | * <p>Protected constructor.</p> |
| 316 | * @param inputStream ICU uprop.dat file input stream |
| 317 | * @exception IOException throw if data file fails authentication |
| 318 | * @draft 2.1 |
| 319 | */ |
| 320 | protected UCharacterPropertyReader(InputStream inputStream) |
| 321 | throws IOException |
| 322 | { |
| 323 | m_unicodeVersion_ = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_, |
| 324 | this); |
| 325 | m_dataInputStream_ = new DataInputStream(inputStream); |
| 326 | } |
| 327 | |
| 328 | // protected methods ------------------------------------------------- |
| 329 | |
| 330 | /** |
| 331 | * <p>Reads uprops.icu, parse it into blocks of data to be stored in |
| 332 | * UCharacterProperty.</P |
| 333 | * @param ucharppty UCharacterProperty instance |
| 334 | * @exception thrown when data reading fails |
| 335 | * @draft 2.1 |
| 336 | */ |
| 337 | protected void read(UCharacterProperty ucharppty) throws IOException |
| 338 | { |
| 339 | // read the indexes |
| 340 | int count = INDEX_SIZE_; |
| 341 | m_propertyOffset_ = m_dataInputStream_.readInt(); |
| 342 | count --; |
| 343 | m_exceptionOffset_ = m_dataInputStream_.readInt(); |
| 344 | count --; |
| 345 | m_caseOffset_ = m_dataInputStream_.readInt(); |
| 346 | count --; |
| 347 | m_additionalOffset_ = m_dataInputStream_.readInt(); |
| 348 | count --; |
| 349 | m_additionalVectorsOffset_ = m_dataInputStream_.readInt(); |
| 350 | count --; |
| 351 | m_additionalColumnsCount_ = m_dataInputStream_.readInt(); |
| 352 | count --; |
| 353 | m_reservedOffset_ = m_dataInputStream_.readInt(); |
| 354 | count --; |
| 355 | m_dataInputStream_.skipBytes(3 << 2); |
| 356 | count -= 3; |
| 357 | ucharppty.m_maxBlockScriptValue_ = m_dataInputStream_.readInt(); |
| 358 | count --; // 10 |
| 359 | ucharppty.m_maxJTGValue_ = m_dataInputStream_.readInt(); |
| 360 | count --; // 11 |
| 361 | m_dataInputStream_.skipBytes(count << 2); |
| 362 | |
| 363 | // read the trie index block |
| 364 | // m_props_index_ in terms of ints |
| 365 | ucharppty.m_trie_ = new CharTrie(m_dataInputStream_, ucharppty); |
| 366 | |
| 367 | // reads the 32 bit properties block |
| 368 | int size = m_exceptionOffset_ - m_propertyOffset_; |
| 369 | ucharppty.m_property_ = new int[size]; |
| 370 | for (int i = 0; i < size; i ++) { |
| 371 | ucharppty.m_property_[i] = m_dataInputStream_.readInt(); |
| 372 | } |
| 373 | |
| 374 | // reads the 32 bit exceptions block |
| 375 | size = m_caseOffset_ - m_exceptionOffset_; |
| 376 | ucharppty.m_exception_ = new int[size]; |
| 377 | for (int i = 0; i < size; i ++) { |
| 378 | ucharppty.m_exception_[i] = m_dataInputStream_.readInt(); |
| 379 | } |
| 380 | |
| 381 | // reads the 32 bit case block |
| 382 | size = (m_additionalOffset_ - m_caseOffset_) << 1; |
| 383 | ucharppty.m_case_ = new char[size]; |
| 384 | for (int i = 0; i < size; i ++) { |
| 385 | ucharppty.m_case_[i] = m_dataInputStream_.readChar(); |
| 386 | } |
| 387 | |
| 388 | // reads the additional property block |
| 389 | ucharppty.m_additionalTrie_ = new CharTrie(m_dataInputStream_, |
| 390 | ucharppty); |
| 391 | |
| 392 | // additional properties |
| 393 | size = m_reservedOffset_ - m_additionalVectorsOffset_; |
| 394 | ucharppty.m_additionalVectors_ = new int[size]; |
| 395 | for (int i = 0; i < size; i ++) { |
| 396 | ucharppty.m_additionalVectors_[i] = m_dataInputStream_.readInt(); |
| 397 | } |
| 398 | |
| 399 | m_dataInputStream_.close(); |
| 400 | ucharppty.m_additionalColumnsCount_ = m_additionalColumnsCount_; |
| 401 | ucharppty.m_unicodeVersion_ = VersionInfo.getInstance( |
| 402 | (int)m_unicodeVersion_[0], (int)m_unicodeVersion_[1], |
| 403 | (int)m_unicodeVersion_[2], (int)m_unicodeVersion_[3]); |
| 404 | } |
| 405 | |
| 406 | // private variables ------------------------------------------------- |
| 407 | |
| 408 | /** |
| 409 | * Index size |
| 410 | */ |
| 411 | private static final int INDEX_SIZE_ = 16; |
| 412 | |
| 413 | /** |
| 414 | * ICU data file input stream |
| 415 | */ |
| 416 | private DataInputStream m_dataInputStream_; |
| 417 | |
| 418 | /** |
| 419 | * Offset information in the indexes. |
| 420 | */ |
| 421 | private int m_propertyOffset_; |
| 422 | private int m_exceptionOffset_; |
| 423 | private int m_caseOffset_; |
| 424 | private int m_additionalOffset_; |
| 425 | private int m_additionalVectorsOffset_; |
| 426 | private int m_additionalColumnsCount_; |
| 427 | private int m_reservedOffset_; |
| 428 | private byte m_unicodeVersion_[]; |
| 429 | |
| 430 | /** |
| 431 | * File format version that this class understands. |
| 432 | * No guarantees are made if a older version is used |
| 433 | */ |
| 434 | private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50, |
| 435 | (byte)0x72, (byte)0x6F}; |
| 436 | private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x3, (byte)0x1, |
| 437 | (byte)Trie.INDEX_STAGE_1_SHIFT_, |
| 438 | (byte)Trie.INDEX_STAGE_2_SHIFT_}; |
| 439 | } |