/*
 * Portions Copyright 2003-2006 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */

/*
 *******************************************************************************
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                    *
 *                                                                            *
 * The original version of this source code and documentation is copyrighted *
 * and owned by IBM. These materials are provided under terms of a License   *
 * Agreement between IBM and Sun. This technology is protected by multiple   *
 * US and International patents. This notice and attribution to IBM may not  *
 * be removed.                                                                *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;

/**
 * @author Ram Viswanadha
 */

/*
 * Description of the format of unorm.icu version 2.1.
 *
 * Main change from version 1 to version 2:
 * Use of new, common Trie instead of normalization-specific tries.
 * Change to version 2.1: add third/auxiliary trie with associated data.
 *
 * For more details of how to use the data structures see the code
 * in unorm.cpp (runtime normalization code) and
 * in gennorm.c and gennorm/store.c (build-time data generation).
 *
 * For the serialized format of Trie see Trie.c/TrieHeader.
 *
 * - Overall partition
 *
 * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
 * After that there are the following structures:
 *
 * char indexes[INDEX_TOP];            -- INDEX_TOP=32, see enum in this file
 *
 * Trie normTrie;                      -- size in bytes=indexes[INDEX_TRIE_SIZE]
 *
 * char extraData[extraDataTop];       -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
 *                                        extraData[0] contains the number of units for
 *                                        FC_NFKC_Closure (formatVersion>=2.1)
 *
 * char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
 *                                        combiningTableTop may include one 16-bit padding unit
 *                                        to make sure that fcdTrie is 32-bit-aligned
 *
 * Trie fcdTrie;                       -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
 *
 * Trie auxTrie;                       -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
 *
 * The indexes array contains lengths and sizes of the following arrays and structures
 * as well as the following values:
 *  indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
 *      -- one more than the highest combining index computed for forward-only-combining characters
 *  indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
 *      -- number of combining indexes computed for both-ways-combining characters
 *  indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
 *      -- number of combining indexes computed for backward-only-combining characters
 *
 *  indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
 *      -- first code point with a quick check NF* value of NO/MAYBE
 *
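 * For illustration only, a minimal sketch of how a caller could size the
 * arrays from indexes[] before reading the rest of the file (the INDEX_*
 * constant names are assumed to be defined by the caller, e.g. NormalizerImpl,
 * and are not declared in this class):
 *
 *     byte[] normBytes      = new byte[indexes[INDEX_TRIE_SIZE]];
 *     char[] extraData      = new char[indexes[INDEX_UCHAR_COUNT]];
 *     char[] combiningTable = new char[indexes[INDEX_COMBINE_DATA_COUNT]];
 *     byte[] fcdBytes       = new byte[indexes[INDEX_FCD_TRIE_SIZE]];
 *     byte[] auxBytes       = new byte[indexes[INDEX_AUX_TRIE_SIZE]];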
 *
 * - Tries
 *
 * The main structures are two Trie tables ("compact arrays"),
 * each with one index array and one data array.
 * See Trie.h and Trie.c.
 *
 * - Tries in unorm.icu
 *
 * The first trie (normTrie above)
 * provides data for the NF* quick checks and normalization.
 * The second trie (fcdTrie above) provides data just for FCD checks.
 *
 * - norm32 data words from the first trie
 *
 * The norm32Table contains one 32-bit word "norm32" per code point.
 * It contains the following bit fields:
 *  31..16  extra data index, EXTRA_SHIFT is used to shift this field down
 *          if this index is <EXTRA_INDEX_TOP then it is an index into
 *          extraData[] where variable-length normalization data for this
 *          code point is found
 *          if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
 *          then this is a norm32 for a leading surrogate, and the index
 *          value is used together with the following trailing surrogate
 *          code unit in the second trie access
 *          if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
 *          then this is a norm32 for a "special" character,
 *          i.e., the character is a Hangul syllable or a Jamo
 *          see EXTRA_HANGUL etc.
 *          generally, instead of extracting this index from the norm32 and
 *          comparing it with the above constants,
 *          the normalization code compares the entire norm32 value
 *          with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
 *
 *  15..8   combining class (cc) according to UnicodeData.txt
 *
 *   7..6   COMBINES_ANY flags, used in composition to see if a character
 *          combines with any following or preceding character(s) at all
 *      7   COMBINES_BACK
 *      6   COMBINES_FWD
 *
 *   5..0   quick check flags, set for "no" or "maybe", with separate flags for
 *          each normalization form
 *          the higher bits are "maybe" flags; for NF*D there are no such flags
 *          the lower bits are "no" flags for all forms, in the same order
 *          as the "maybe" flags,
 *          which is (MSB to LSB): NFKD NFD NFKC NFC
 *   5..4   QC_ANY_MAYBE
 *   3..0   QC_ANY_NO
 *          see further related constants
 *
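 * For illustration only, a minimal sketch of unpacking these bit fields
 * (the local names below are assumptions, not the constants actually used
 * by the normalization code):
 *
 *     int extraIndex       = norm32 >>> 16;          // bits 31..16 (EXTRA_SHIFT)
 *     int cc               = (norm32 >> 8) & 0xff;   // bits 15..8, combining class
 *     boolean combinesBack = (norm32 & 0x80) != 0;   // bit 7, COMBINES_BACK
 *     boolean combinesFwd  = (norm32 & 0x40) != 0;   // bit 6, COMBINES_FWD
 *     int qcAnyMaybe       = (norm32 >> 4) & 0x3;    // bits 5..4, QC_ANY_MAYBE
 *     int qcAnyNo          = norm32 & 0xf;           // bits 3..0, QC_ANY_NO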
 *
 * - Extra data per code point
 *
 * "Extra data" is referenced by the index in norm32.
 * It is variable-length data; only the parts that are needed for a given
 * character are present.
 * The norm32 extra data index is added to the beginning of extraData[]
 * to get to a vector of 16-bit words with data at the following offsets:
 *
 * [-1]     Combining index for composition.
 *          Stored only if norm32&COMBINES_ANY .
 * [0]      Lengths of the canonical and compatibility decomposition strings.
 *          Stored only if there are decompositions, i.e.,
 *          if norm32&(QC_NFD|QC_NFKD)
 *          High byte: length of NFKD, or 0 if none
 *          Low byte: length of NFD, or 0 if none
 *          Each length byte also has another flag:
 *              Bit 7 of a length byte is set if there are non-zero
 *              combining classes (cc's) associated with the respective
 *              decomposition. If this flag is set, then the decomposition
 *              is preceded by a 16-bit word that contains the
 *              leading and trailing cc's.
 *              Bits 6..0 of a length byte are the length of the
 *              decomposition string, not counting the cc word.
 * [1..n]   NFD
 * [n+1..]  NFKD
 *
 * Each of the two decompositions consists of up to two parts:
 * - The 16-bit word with the leading and trailing cc's.
 *   This is only stored if bit 7 of the corresponding length byte
 *   is set. In this case, at least one of the cc's is not zero.
 *   High byte: leading cc==cc of the first code point in the decomposition string
 *   Low byte: trailing cc==cc of the last code point in the decomposition string
 * - The decomposition string in UTF-16, with the number of code units
 *   given by the length byte.
 *
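 * For illustration only, a minimal sketch of decoding the length word at
 * offset [0] (p is assumed to be the norm32 extra data index; all names
 * here are illustrative):
 *
 *     char lengths         = extraData[p];           // offset [0]
 *     int nfkdLengthByte   = lengths >>> 8;          // high byte: NFKD
 *     int nfdLengthByte    = lengths & 0xff;         // low byte: NFD
 *     int nfdLength        = nfdLengthByte & 0x7f;   // bits 6..0: string length
 *     boolean nfdHasCcWord = (nfdLengthByte & 0x80) != 0;  // bit 7: cc word present
 *     // the NFD part (cc word, if flagged, then the string) starts at p+1;
 *     // the NFKD part follows the NFD part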
 *
 * - Combining indexes and combiningTable[]
 *
 * Combining indexes are stored at the [-1] offset of the extra data
 * if the character combines forward or backward with any other characters.
 * They are used for (re)composition in NF*C.
 * Values of combining indexes are arranged according to whether a character
 * combines forward, backward, or both ways:
 *     forward-only < both ways < backward-only
 *
 * The index values for forward-only and both-ways combining characters
 * are indexes into the combiningTable[].
 * The index values for backward-only combining characters are simply
 * incremented from the preceding index values to be unique.
 *
 * In the combiningTable[], a variable-length list
 * of variable-length (back-index, code point) pair entries is stored
 * for each forward-combining character.
 *
 * These back-indexes are the combining indexes of both-ways or backward-only
 * combining characters that the forward-combining character combines with.
 *
 * Each list is sorted in ascending order of back-indexes.
 * Each list is terminated with the last back-index having bit 15 set.
 *
 * Each pair (back-index, code point) takes up either 2 or 3
 * 16-bit words.
 * The first word of a list entry is the back-index, with its bit 15 set if
 * this is the last pair in the list.
 *
 * The second word contains flags in bits 15..13 that determine
 * if there is a third word and how the combined character is encoded:
 *  15      set if there is a third word in this list entry
 *  14      set if the result is a supplementary character
 *  13      set if the result itself combines forward
 *
 * According to these bits 15..14 of the second word,
 * the result character is encoded as follows:
 *  00 or 01  The result is <=0x1fff and stored in bits 12..0 of
 *            the second word.
 *  10        The result is 0x2000..0xffff and stored in the third word.
 *            Bits 12..0 of the second word are not used.
 *  11        The result is a supplementary character.
 *            Bits 9..0 of the leading surrogate are in bits 9..0 of
 *            the second word.
 *            Add 0xd800 to these bits to get the complete surrogate.
 *            Bits 12..10 of the second word are not used.
 *            The trailing surrogate is stored in the third word.
 *
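 * For illustration only, a minimal sketch of decoding one list entry starting
 * at combiningTable[i] (all names are illustrative):
 *
 *     char w1 = combiningTable[i];
 *     int backIndex    = w1 & 0x7fff;                 // combining index of the other character
 *     boolean lastPair = (w1 & 0x8000) != 0;          // bit 15: last pair in the list
 *     char w2 = combiningTable[i + 1];
 *     boolean resultCombinesFwd = (w2 & 0x2000) != 0; // bit 13
 *     int result;
 *     switch (w2 >>> 14) {                            // bits 15..14
 *     case 0:
 *     case 1:                                         // result <= 0x1fff
 *         result = w2 & 0x1fff; i += 2; break;
 *     case 2:                                         // result 0x2000..0xffff
 *         result = combiningTable[i + 2]; i += 3; break;
 *     default:                                        // supplementary character
 *         char lead  = (char)(0xd800 + (w2 & 0x3ff));
 *         char trail = combiningTable[i + 2];
 *         result = Character.toCodePoint(lead, trail); i += 3; break;
 *     }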
 *
 * - FCD trie
 *
 * The FCD trie is very simple.
 * It is a folded trie with 16-bit data words.
 * In each word, the high byte contains the leading cc of the character,
 * and the low byte contains the trailing cc of the character.
 * These cc's are the cc's of the first and last code points in the
 * canonical decomposition of the character.
 *
 * Since all 16 bits are used for cc's, lead surrogates must be tested
 * by checking the code unit instead of the trie data.
 * This is done only if the 16-bit data word is not zero.
 * If the code unit is a leading surrogate and the data word is not zero,
 * then instead of cc's it contains the offset for the second trie lookup.
 *
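 * For illustration only, a minimal sketch of reading the cc's from a non-zero
 * FCD value fcd of a code unit that is not a lead surrogate (names illustrative):
 *
 *     int leadCC  = (fcd >> 8) & 0xff;   // cc of the first code point of the decomposition
 *     int trailCC = fcd & 0xff;          // cc of the last code point of the decomposition
 *     // for a lead surrogate with a non-zero value, the 16 bits are instead
 *     // the offset for the second trie lookup with the trail surrogate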
 *
 * - Auxiliary trie and data
 *
 * The auxiliary 16-bit trie contains data for additional properties.
 * Bits
 *  15..13  reserved
 *      12  not NFC_Skippable (f) (formatVersion>=2.2)
 *      11  flag: not a safe starter for canonical closure
 *      10  composition exclusion
 *   9.. 0  index into extraData[] to FC_NFKC_Closure string
 *          (not for lead surrogate),
 *          or lead surrogate offset (for lead surrogate, if 9..0 not zero)
 *
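 * For illustration only, a minimal sketch of unpacking an auxiliary trie
 * value (names are illustrative, not the constants used elsewhere):
 *
 *     boolean notNFCSkippable      = (aux & 0x1000) != 0;  // bit 12 (f)
 *     boolean notSafeStarter       = (aux & 0x0800) != 0;  // bit 11
 *     boolean compositionExclusion = (aux & 0x0400) != 0;  // bit 10
 *     int fncIndexOrLeadOffset     = aux & 0x03ff;         // bits 9..0
 *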
 * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
 * (used in NormalizerTransliterator)
 *
 * A skippable character is
 * a) unassigned, or ALL of the following:
 * b) of combining class 0.
 * c) not decomposed by this normalization form.
 * AND if NFC or NFKC,
 * d) can never compose with a previous character.
 * e) can never compose with a following character.
 * f) can never change if another character is added.
 *    Example: a-breve might satisfy all but f, but if you
 *    add an ogonek it changes to a-ogonek + breve
 *
 * a)..e) must be tested from norm32.
 * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
 * into the auxiliary trie.
 * The same bit is used for NFC and NFKC; (c) differs for them.
 * As usual, we build the "not skippable" flags so that unassigned
 * code points get a 0 bit.
 * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
 * Test Hangul LV syllables entirely in code.
 *
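 * For illustration only, a rough sketch of the NFC variant of this test,
 * using the bit masks sketched earlier in this comment (the actual test in
 * the normalization code differs in its constants and ordering):
 *
 *     static boolean isNFCSkippable(int norm32, char aux) {
 *         if (((norm32 >> 8) & 0xff) != 0) return false;  // (b) cc must be 0
 *         if ((norm32 & 0xc0) != 0) return false;         // (d),(e) COMBINES_ANY
 *         if ((norm32 & 0x11) != 0) return false;         // (c) NFC "no"/"maybe" quick check
 *         if ((norm32 & 0x04) != 0) return false;         // NFD_NO, tested before (f)
 *         return (aux & 0x1000) == 0;                     // (f) not-NFC_Skippable bit clear
 *     }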
 *
 * - FC_NFKC_Closure strings in extraData[]
 *
 * Strings are either stored as a single code unit or as the length
 * followed by that many units.
 *
 */
final class NormalizerDataReader implements ICUBinary.Authenticate {

    /**
     * <p>Protected constructor.</p>
     * @param inputStream ICU unorm.icu data file input stream
     * @exception IOException thrown if the data file fails authentication
     * @draft 2.1
     */
    protected NormalizerDataReader(InputStream inputStream)
            throws IOException {

        unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
        dataInputStream = new DataInputStream(inputStream);
    }

    // protected methods -------------------------------------------------

    protected int[] readIndexes(int length) throws IOException {
        int[] indexes = new int[length];
        // Read the indexes
        for (int i = 0; i < length; i++) {
            indexes[i] = dataInputStream.readInt();
        }
        return indexes;
    }

    /**
     * <p>Reads unorm.icu and parses it into blocks of data to be stored in
     * NormalizerImpl.</p>
     * @param normBytes
     * @param fcdBytes
     * @param auxBytes
     * @param extraData
     * @param combiningTable
     * @exception IOException thrown if data reading fails
     * @draft 2.1
     */
    protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
                        char[] extraData, char[] combiningTable)
                        throws IOException {

        // Read the bytes that make up the normTrie
        dataInputStream.readFully(normBytes);

        // Read the extra data
        for (int i = 0; i < extraData.length; i++) {
            extraData[i] = dataInputStream.readChar();
        }

        // Read the combining class table
        for (int i = 0; i < combiningTable.length; i++) {
            combiningTable[i] = dataInputStream.readChar();
        }

        // Read the fcdTrie
        dataInputStream.readFully(fcdBytes);

        // Read the auxTrie
        dataInputStream.readFully(auxBytes);
    }
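
    /*
     * A hedged usage sketch (the stream source and the INDEX_* constant names
     * are assumptions for illustration; per the javadoc above, the parsed
     * blocks are stored in NormalizerImpl):
     *
     *     NormalizerDataReader reader = new NormalizerDataReader(stream);
     *     int[] indexes = reader.readIndexes(INDEX_TOP);   // INDEX_TOP=32
     *     // allocate normBytes, fcdBytes, auxBytes, extraData and combiningTable
     *     // from the sizes in indexes[] (see the format description above), then:
     *     reader.read(normBytes, fcdBytes, auxBytes, extraData, combiningTable);
     */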

    public byte[] getDataFormatVersion() {
        return DATA_FORMAT_VERSION;
    }

    public boolean isDataVersionAcceptable(byte version[]) {
        return version[0] == DATA_FORMAT_VERSION[0]
               && version[2] == DATA_FORMAT_VERSION[2]
               && version[3] == DATA_FORMAT_VERSION[3];
    }

    public byte[] getUnicodeVersion() {
        return unicodeVersion;
    }

    // private data members -------------------------------------------------

    /**
     * ICU data file input stream
     */
    private DataInputStream dataInputStream;

    private byte[] unicodeVersion;

    /**
     * File format version that this class understands.
     * No guarantees are made if an older version is used;
     * see store.c of gennorm for more information and values.
     */
    private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
                                                  (byte)0x72, (byte)0x6D}; // "Norm"
    private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
                                                       (byte)0x5, (byte)0x2};

}