Blame - jdk/src/share/classes/sun/text/normalizer/NormalizerDataReader.java - platform/libcore

blob: 0378d5b592376b64e851baa685ba04504d144fe6 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Portions Copyright 2003-2006 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	/*
				27	*******************************************************************************
				28	* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
				29	* *
				30	* The original version of this source code and documentation is copyrighted *
				31	* and owned by IBM, These materials are provided under terms of a License *
				32	* Agreement between IBM and Sun. This technology is protected by multiple *
				33	* US and International patents. This notice and attribution to IBM may not *
				34	* to removed. *
				35	*******************************************************************************
				36	*/
				37
				38	package sun.text.normalizer;
				39
				40	import java.io.DataInputStream;
				41	import java.io.InputStream;
				42	import java.io.IOException;
				43
				44	/**
				45	* @author Ram Viswanadha
				46	*/
				47
				48	/*
				49	* Description of the format of unorm.icu version 2.1.
				50	*
				51	* Main change from version 1 to version 2:
				52	* Use of new, common Trie instead of normalization-specific tries.
				53	* Change to version 2.1: add third/auxiliary trie with associated data.
				54	*
				55	* For more details of how to use the data structures see the code
				56	* in unorm.cpp (runtime normalization code) and
				57	* in gennorm.c and gennorm/store.c (build-time data generation).
				58	*
				59	* For the serialized format of Trie see Trie.c/TrieHeader.
				60	*
				61	* - Overall partition
				62	*
				63	* unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
				64	* After that there are the following structures:
				65	*
				66	* char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
				67	*
				68	* Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
				69	*
				70	* char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
				71	* extraData[0] contains the number of units for
				72	* FC_NFKC_Closure (formatVersion>=2.1)
				73	*
				74	* char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
				75	* combiningTableTop may include one 16-bit padding unit
				76	* to make sure that fcdTrie is 32-bit-aligned
				77	*
				78	* Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
				79	*
				80	* Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
				81	*
				82	*
				83	* The indexes array contains lengths and sizes of the following arrays and structures
				84	* as well as the following values:
				85	* indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
				86	* -- one more than the highest combining index computed for forward-only-combining characters
				87	* indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
				88	* -- number of combining indexes computed for both-ways-combining characters
				89	* indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
				90	* -- number of combining indexes computed for backward-only-combining characters
				91	*
				92	* indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
				93	* -- first code point with a quick check NF* value of NO/MAYBE
				94	*
				95	*
				96	* - Tries
				97	*
				98	* The main structures are two Trie tables ("compact arrays"),
				99	* each with one index array and one data array.
				100	* See Trie.h and Trie.c.
				101	*
				102	*
				103	* - Tries in unorm.icu
				104	*
				105	* The first trie (normTrie above)
				106	* provides data for the NF* quick checks and normalization.
				107	* The second trie (fcdTrie above) provides data just for FCD checks.
				108	*
				109	*
				110	* - norm32 data words from the first trie
				111	*
				112	* The norm32Table contains one 32-bit word "norm32" per code point.
				113	* It contains the following bit fields:
				114	* 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
				115	* if this index is <EXTRA_INDEX_TOP then it is an index into
				116	* extraData[] where variable-length normalization data for this
				117	* code point is found
				118	* if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
				119	* then this is a norm32 for a leading surrogate, and the index
				120	* value is used together with the following trailing surrogate
				121	* code unit in the second trie access
				122	* if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
				123	* then this is a norm32 for a "special" character,
				124	* i.e., the character is a Hangul syllable or a Jamo
				125	* see EXTRA_HANGUL etc.
				126	* generally, instead of extracting this index from the norm32 and
				127	* comparing it with the above constants,
				128	* the normalization code compares the entire norm32 value
				129	* with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
				130	*
				131	* 15..8 combining class (cc) according to UnicodeData.txt
				132	*
				133	* 7..6 COMBINES_ANY flags, used in composition to see if a character
				134	* combines with any following or preceding character(s)
				135	* at all
				136	* 7 COMBINES_BACK
				137	* 6 COMBINES_FWD
				138	*
				139	* 5..0 quick check flags, set for "no" or "maybe", with separate flags for
				140	* each normalization form
				141	* the higher bits are "maybe" flags; for NF*D there are no such flags
				142	* the lower bits are "no" flags for all forms, in the same order
				143	* as the "maybe" flags,
				144	* which is (MSB to LSB): NFKD NFD NFKC NFC
				145	* 5..4 QC_ANY_MAYBE
				146	* 3..0 QC_ANY_NO
				147	* see further related constants
				148	*
				149	*
				150	* - Extra data per code point
				151	*
				152	* "Extra data" is referenced by the index in norm32.
				153	* It is variable-length data. It is only present, and only those parts
				154	* of it are, as needed for a given character.
				155	* The norm32 extra data index is added to the beginning of extraData[]
				156	* to get to a vector of 16-bit words with data at the following offsets:
				157	*
				158	* [-1] Combining index for composition.
				159	* Stored only if norm32&COMBINES_ANY .
				160	* [0] Lengths of the canonical and compatibility decomposition strings.
				161	* Stored only if there are decompositions, i.e.,
				162	* if norm32&(QC_NFD|QC_NFKD)
				163	* High byte: length of NFKD, or 0 if none
				164	* Low byte: length of NFD, or 0 if none
				165	* Each length byte also has another flag:
				166	* Bit 7 of a length byte is set if there are non-zero
				167	* combining classes (cc's) associated with the respective
				168	* decomposition. If this flag is set, then the decomposition
				169	* is preceded by a 16-bit word that contains the
				170	* leading and trailing cc's.
				171	* Bits 6..0 of a length byte are the length of the
				172	* decomposition string, not counting the cc word.
				173	* [1..n] NFD
				174	* [n+1..] NFKD
				175	*
				176	* Each of the two decompositions consists of up to two parts:
				177	* - The 16-bit words with the leading and trailing cc's.
				178	* This is only stored if bit 7 of the corresponding length byte
				179	* is set. In this case, at least one of the cc's is not zero.
				180	* High byte: leading cc==cc of the first code point in the decomposition string
				181	* Low byte: trailing cc==cc of the last code point in the decomposition string
				182	* - The decomposition string in UTF-16, with length code units.
				183	*
				184	*
				185	* - Combining indexes and combiningTable[]
				186	*
				187	* Combining indexes are stored at the [-1] offset of the extra data
				188	* if the character combines forward or backward with any other characters.
				189	* They are used for (re)composition in NF*C.
				190	* Values of combining indexes are arranged according to whether a character
				191	* combines forward, backward, or both ways:
				192	* forward-only < both ways < backward-only
				193	*
				194	* The index values for forward-only and both-ways combining characters
				195	* are indexes into the combiningTable[].
				196	* The index values for backward-only combining characters are simply
				197	* incremented from the preceding index values to be unique.
				198	*
				199	* In the combiningTable[], a variable-length list
				200	* of variable-length (back-index, code point) pair entries is stored
				201	* for each forward-combining character.
				202	*
				203	* These back-indexes are the combining indexes of both-ways or backward-only
				204	* combining characters that the forward-combining character combines with.
				205	*
				206	* Each list is sorted in ascending order of back-indexes.
				207	* Each list is terminated with the last back-index having bit 15 set.
				208	*
				209	* Each pair (back-index, code point) takes up either 2 or 3
				210	* 16-bit words.
				211	* The first word of a list entry is the back-index, with its bit 15 set if
				212	* this is the last pair in the list.
				213	*
				214	* The second word contains flags in bits 15..13 that determine
				215	* if there is a third word and how the combined character is encoded:
				216	* 15 set if there is a third word in this list entry
				217	* 14 set if the result is a supplementary character
				218	* 13 set if the result itself combines forward
				219	*
				220	* According to these bits 15..14 of the second word,
				221	* the result character is encoded as follows:
				222	* 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
				223	* the second word.
				224	* 10 The result is 0x2000..0xffff and stored in the third word.
				225	* Bits 12..0 of the second word are not used.
				226	* 11 The result is a supplementary character.
				227	* Bits 9..0 of the leading surrogate are in bits 9..0 of
				228	* the second word.
				229	* Add 0xd800 to these bits to get the complete surrogate.
				230	* Bits 12..10 of the second word are not used.
				231	* The trailing surrogate is stored in the third word.
				232	*
				233	*
				234	* - FCD trie
				235	*
				236	* The FCD trie is very simple.
				237	* It is a folded trie with 16-bit data words.
				238	* In each word, the high byte contains the leading cc of the character,
				239	* and the low byte contains the trailing cc of the character.
				240	* These cc's are the cc's of the first and last code points in the
				241	* canonical decomposition of the character.
				242	*
				243	* Since all 16 bits are used for cc's, lead surrogates must be tested
				244	* by checking the code unit instead of the trie data.
				245	* This is done only if the 16-bit data word is not zero.
				246	* If the code unit is a leading surrogate and the data word is not zero,
				247	* then instead of cc's it contains the offset for the second trie lookup.
				248	*
				249	*
				250	* - Auxiliary trie and data
				251	*
				252	*
				253	* The auxiliary 16-bit trie contains data for additional properties.
				254	* Bits
				255	* 15..13 reserved
				256	* 12 not NFC_Skippable (f) (formatVersion>=2.2)
				257	* 11 flag: not a safe starter for canonical closure
				258	* 10 composition exclusion
				259	* 9.. 0 index into extraData[] to FC_NFKC_Closure string
				260	* (not for lead surrogate),
				261	* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
				262	*
				263	* Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
				264	* (used in NormalizerTransliterator)
				265	*
				266	* A skippable character is
				267	* a) unassigned, or ALL of the following:
				268	* b) of combining class 0.
				269	* c) not decomposed by this normalization form.
				270	* AND if NFC or NFKC,
				271	* d) can never compose with a previous character.
				272	* e) can never compose with a following character.
				273	* f) can never change if another character is added.
				274	* Example: a-breve might satisfy all but f, but if you
				275	* add an ogonek it changes to a-ogonek + breve
				276	*
				277	* a)..e) must be tested from norm32.
				278	* Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
				279	* into the auxiliary trie.
				280	* The same bit is used for NFC and NFKC; (c) differs for them.
				281	* As usual, we build the "not skippable" flags so that unassigned
				282	* code points get a 0 bit.
				283	* This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
				284	* Test Hangul LV syllables entirely in code.
				285	*
				286	*
				287	* - FC_NFKC_Closure strings in extraData[]
				288	*
				289	* Strings are either stored as a single code unit or as the length
				290	* followed by that many units.
				291	*
				292	*/
				293	final class NormalizerDataReader implements ICUBinary.Authenticate {
				294
				295	/**
				296	* <p>Protected constructor.</p>
				297	* @param inputStream ICU uprop.dat file input stream
				298	* @exception IOException throw if data file fails authentication
				299	* @draft 2.1
				300	*/
				301	protected NormalizerDataReader(InputStream inputStream)
				302	throws IOException{
				303
				304	unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
				305	dataInputStream = new DataInputStream(inputStream);
				306	}
				307
				308	// protected methods -------------------------------------------------
				309
				310	protected int[] readIndexes(int length)throws IOException{
				311	int[] indexes = new int[length];
				312	//Read the indexes
				313	for (int i = 0; i <length ; i++) {
				314	indexes[i] = dataInputStream.readInt();
				315	}
				316	return indexes;
				317	}
				318	/**
				319	* <p>Reads unorm.icu, parse it into blocks of data to be stored in
				320	* NormalizerImpl.</P
				321	* @param normBytes
				322	* @param fcdBytes
				323	* @param auxBytes
				324	* @param extraData
				325	* @param combiningTable
				326	* @exception thrown when data reading fails
				327	* @draft 2.1
				328	*/
				329	protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
				330	char[] extraData, char[] combiningTable)
				331	throws IOException{
				332
				333	//Read the bytes that make up the normTrie
				334	dataInputStream.read(normBytes);
				335
				336	//normTrieStream= new ByteArrayInputStream(normBytes);
				337
				338	//Read the extra data
				339	for(int i=0;i<extraData.length;i++){
				340	extraData[i]=dataInputStream.readChar();
				341	}
				342
				343	//Read the combining class table
				344	for(int i=0; i<combiningTable.length; i++){
				345	combiningTable[i]=dataInputStream.readChar();
				346	}
				347
				348	//Read the fcdTrie
				349	dataInputStream.read(fcdBytes);
				350
				351
				352	//Read the AuxTrie
				353	dataInputStream.read(auxBytes);
				354	}
				355
				356	public byte[] getDataFormatVersion(){
				357	return DATA_FORMAT_VERSION;
				358	}
				359
				360	public boolean isDataVersionAcceptable(byte version[])
				361	{
				362	return version[0] == DATA_FORMAT_VERSION[0]
				363	&& version[2] == DATA_FORMAT_VERSION[2]
				364	&& version[3] == DATA_FORMAT_VERSION[3];
				365	}
				366
				367	public byte[] getUnicodeVersion(){
				368	return unicodeVersion;
				369	}
				370	// private data members -------------------------------------------------
				371
				372
				373	/**
				374	* ICU data file input stream
				375	*/
				376	private DataInputStream dataInputStream;
				377
				378	private byte[] unicodeVersion;
				379
				380	/**
				381	* File format version that this class understands.
				382	* No guarantees are made if a older version is used
				383	* see store.c of gennorm for more information and values
				384	*/
				385	private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
				386	(byte)0x72, (byte)0x6D};
				387	private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
				388	(byte)0x5, (byte)0x2};
				389
				390	}