Blame - jdk/src/share/classes/java/text/RBTableBuilder.java - platform/libcore

blob: d89bb99f6f034d63945cf16c4a63deeef588bbf1 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Copyright 1999-2005 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	/*
				27	* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
				28	* (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved
				29	*
				30	* The original version of this source code and documentation is copyrighted
				31	* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
				32	* materials are provided under terms of a License Agreement between Taligent
				33	* and Sun. This technology is protected by multiple US and International
				34	* patents. This notice and attribution to Taligent may not be removed.
				35	* Taligent is a registered trademark of Taligent, Inc.
				36	*
				37	*/
				38
				39	package java.text;
				40
				41	import java.util.Vector;
				42	import sun.text.UCompactIntArray;
				43	import sun.text.IntHashtable;
				44	import sun.text.ComposedCharIter;
				45	import sun.text.CollatorUtilities;
				46	import sun.text.normalizer.NormalizerImpl;
				47
				48	/**
				49	* This class contains all the code to parse a RuleBasedCollator pattern
				50	* and build a RBCollationTables object from it. A particular instance
				51	* of tis class exists only during the actual build process-- once an
				52	* RBCollationTables object has been built, the RBTableBuilder object
				53	* goes away. This object carries all of the state which is only needed
				54	* during the build process, plus a "shadow" copy of all of the state
				55	* that will go into the tables object itself. This object communicates
				56	* with RBCollationTables through a separate class, RBCollationTables.BuildAPI,
				57	* this is an inner class of RBCollationTables and provides a separate
				58	* private API for communication with RBTableBuilder.
				59	* This class isn't just an inner class of RBCollationTables itself because
				60	* of its large size. For source-code readability, it seemed better for the
				61	* builder to have its own source file.
				62	*/
				63	final class RBTableBuilder {
				64
				65	public RBTableBuilder(RBCollationTables.BuildAPI tables) {
				66	this.tables = tables;
				67	}
				68
				69	/**
				70	* Create a table-based collation object with the given rules.
				71	* This is the main function that actually builds the tables and
				72	* stores them back in the RBCollationTables object. It is called
				73	* ONLY by the RBCollationTables constructor.
				74	* @see java.util.RuleBasedCollator#RuleBasedCollator
				75	* @exception ParseException If the rules format is incorrect.
				76	*/
				77
				78	public void build(String pattern, int decmp) throws ParseException
				79	{
				80	boolean isSource = true;
				81	int i = 0;
				82	String expChars;
				83	String groupChars;
				84	if (pattern.length() == 0)
				85	throw new ParseException("Build rules empty.", 0);
				86
				87	// This array maps Unicode characters to their collation ordering
				88	mapping = new UCompactIntArray((int)RBCollationTables.UNMAPPED);
				89	// Normalize the build rules. Find occurances of all decomposed characters
				90	// and normalize the rules before feeding into the builder. By "normalize",
				91	// we mean that all precomposed Unicode characters must be converted into
				92	// a base character and one or more combining characters (such as accents).
				93	// When there are multiple combining characters attached to a base character,
				94	// the combining characters must be in their canonical order
				95	//
				96	// sherman/Note:
				97	//(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose
				98	//hangual syllables to jamos, so we can actually just call decompose with
				99	//normalizer's IGNORE_HANGUL option turned on
				100	//
				101	//(2)just call the "special version" in NormalizerImpl directly
				102	//pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true);
				103	//
				104	//Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp);
				105	//pattern = Normalizer.normalize(pattern, mode, 0, true);
				106
				107	pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern);
				108
				109	// Build the merged collation entries
				110	// Since rules can be specified in any order in the string
				111	// (e.g. "c , C < d , D < e , E .... C < CH")
				112	// this splits all of the rules in the string out into separate
				113	// objects and then sorts them. In the above example, it merges the
				114	// "C < CH" rule in just before the "C < D" rule.
				115	//
				116
				117	mPattern = new MergeCollation(pattern);
				118
				119	int order = 0;
				120
				121	// Now walk though each entry and add it to my own tables
				122	for (i = 0; i < mPattern.getCount(); ++i)
				123	{
				124	PatternEntry entry = mPattern.getItemAt(i);
				125	if (entry != null) {
				126	groupChars = entry.getChars();
				127	if (groupChars.length() > 1) {
				128	switch(groupChars.charAt(groupChars.length()-1)) {
				129	case '@':
				130	frenchSec = true;
				131	groupChars = groupChars.substring(0, groupChars.length()-1);
				132	break;
				133	case '!':
				134	seAsianSwapping = true;
				135	groupChars = groupChars.substring(0, groupChars.length()-1);
				136	break;
				137	}
				138	}
				139
				140	order = increment(entry.getStrength(), order);
				141	expChars = entry.getExtension();
				142
				143	if (expChars.length() != 0) {
				144	addExpandOrder(groupChars, expChars, order);
				145	} else if (groupChars.length() > 1) {
				146	char ch = groupChars.charAt(0);
				147	if (Character.isHighSurrogate(ch) && groupChars.length() == 2) {
				148	addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order);
				149	} else {
				150	addContractOrder(groupChars, order);
				151	}
				152	} else {
				153	char ch = groupChars.charAt(0);
				154	addOrder(ch, order);
				155	}
				156	}
				157	}
				158	addComposedChars();
				159
				160	commit();
				161	mapping.compact();
				162	/*
				163	System.out.println("mappingSize=" + mapping.getKSize());
				164	for (int j = 0; j < 0xffff; j++) {
				165	int value = mapping.elementAt(j);
				166	if (value != RBCollationTables.UNMAPPED)
				167	System.out.println("index=" + Integer.toString(j, 16)
				168	+ ", value=" + Integer.toString(value, 16));
				169	}
				170	*/
				171	tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable,
				172	contractFlags, maxSecOrder, maxTerOrder);
				173	}
				174
				175	/** Add expanding entries for pre-composed unicode characters so that this
				176	* collator can be used reasonably well with decomposition turned off.
				177	*/
				178	private void addComposedChars() throws ParseException {
				179	// Iterate through all of the pre-composed characters in Unicode
				180	ComposedCharIter iter = new ComposedCharIter();
				181	int c;
				182	while ((c = iter.next()) != ComposedCharIter.DONE) {
				183	if (getCharOrder(c) == RBCollationTables.UNMAPPED) {
				184	//
				185	// We don't already have an ordering for this pre-composed character.
				186	//
				187	// First, see if the decomposed string is already in our
				188	// tables as a single contracting-string ordering.
				189	// If so, just map the precomposed character to that order.
				190	//
				191	// TODO: What we should really be doing here is trying to find the
				192	// longest initial substring of the decomposition that is present
				193	// in the tables as a contracting character sequence, and find its
				194	// ordering. Then do this recursively with the remaining chars
				195	// so that we build a list of orderings, and add that list to
				196	// the expansion table.
				197	// That would be more correct but also significantly slower, so
				198	// I'm not totally sure it's worth doing.
				199	//
				200	String s = iter.decomposition();
				201
				202	//sherman/Note: if this is 1 character decomposed string, the
				203	//only thing need to do is to check if this decomposed character
				204	//has an entry in our order table, this order is not necessary
				205	//to be a contraction order, if it does have one, add an entry
				206	//for the precomposed character by using the same order, the
				207	//previous impl unnecessarily adds a single character expansion
				208	//entry.
				209	if (s.length() == 1) {
				210	int order = getCharOrder(s.charAt(0));
				211	if (order != RBCollationTables.UNMAPPED) {
				212	addOrder(c, order);
				213	}
				214	continue;
				215	} else if (s.length() == 2) {
				216	char ch0 = s.charAt(0);
				217	if (Character.isHighSurrogate(ch0)) {
				218	int order = getCharOrder(s.codePointAt(0));
				219	if (order != RBCollationTables.UNMAPPED) {
				220	addOrder(c, order);
				221	}
				222	continue;
				223	}
				224	}
				225	int contractOrder = getContractOrder(s);
				226	if (contractOrder != RBCollationTables.UNMAPPED) {
				227	addOrder(c, contractOrder);
				228	} else {
				229	//
				230	// We don't have a contracting ordering for the entire string
				231	// that results from the decomposition, but if we have orders
				232	// for each individual character, we can add an expanding
				233	// table entry for the pre-composed character
				234	//
				235	boolean allThere = true;
				236	for (int i = 0; i < s.length(); i++) {
				237	if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) {
				238	allThere = false;
				239	break;
				240	}
				241	}
				242	if (allThere) {
				243	addExpandOrder(c, s, RBCollationTables.UNMAPPED);
				244	}
				245	}
				246	}
				247	}
				248	}
				249
				250	/**
				251	* Look up for unmapped values in the expanded character table.
				252	*
				253	* When the expanding character tables are built by addExpandOrder,
				254	* it doesn't know what the final ordering of each character
				255	* in the expansion will be. Instead, it just puts the raw character
				256	* code into the table, adding CHARINDEX as a flag. Now that we've
				257	* finished building the mapping table, we can go back and look up
				258	* that character to see what its real collation order is and
				259	* stick that into the expansion table. That lets us avoid doing
				260	* a two-stage lookup later.
				261	*/
				262	private final void commit()
				263	{
				264	if (expandTable != null) {
				265	for (int i = 0; i < expandTable.size(); i++) {
				266	int[] valueList = (int [])expandTable.elementAt(i);
				267	for (int j = 0; j < valueList.length; j++) {
				268	int order = valueList[j];
				269	if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) {
				270	// found a expanding character that isn't filled in yet
				271	int ch = order - CHARINDEX;
				272
				273	// Get the real values for the non-filled entry
				274	int realValue = getCharOrder(ch);
				275
				276	if (realValue == RBCollationTables.UNMAPPED) {
				277	// The real value is still unmapped, maybe it's ignorable
				278	valueList[j] = IGNORABLEMASK & ch;
				279	} else {
				280	// just fill in the value
				281	valueList[j] = realValue;
				282	}
				283	}
				284	}
				285	}
				286	}
				287	}
				288	/**
				289	* Increment of the last order based on the comparison level.
				290	*/
				291	private final int increment(int aStrength, int lastValue)
				292	{
				293	switch(aStrength)
				294	{
				295	case Collator.PRIMARY:
				296	// increment priamry order and mask off secondary and tertiary difference
				297	lastValue += PRIMARYORDERINCREMENT;
				298	lastValue &= RBCollationTables.PRIMARYORDERMASK;
				299	isOverIgnore = true;
				300	break;
				301	case Collator.SECONDARY:
				302	// increment secondary order and mask off tertiary difference
				303	lastValue += SECONDARYORDERINCREMENT;
				304	lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY;
				305	// record max # of ignorable chars with secondary difference
				306	if (!isOverIgnore)
				307	maxSecOrder++;
				308	break;
				309	case Collator.TERTIARY:
				310	// increment tertiary order
				311	lastValue += TERTIARYORDERINCREMENT;
				312	// record max # of ignorable chars with tertiary difference
				313	if (!isOverIgnore)
				314	maxTerOrder++;
				315	break;
				316	}
				317	return lastValue;
				318	}
				319
				320	/**
				321	* Adds a character and its designated order into the collation table.
				322	*/
				323	private final void addOrder(int ch, int anOrder)
				324	{
				325	// See if the char already has an order in the mapping table
				326	int order = mapping.elementAt(ch);
				327
				328	if (order >= RBCollationTables.CONTRACTCHARINDEX) {
				329	// There's already an entry for this character that points to a contracting
				330	// character table. Instead of adding the character directly to the mapping
				331	// table, we must add it to the contract table instead.
				332	int length = 1;
				333	if (Character.isSupplementaryCodePoint(ch)) {
				334	length = Character.toChars(ch, keyBuf, 0);
				335	} else {
				336	keyBuf[0] = (char)ch;
				337	}
				338	addContractOrder(new String(keyBuf, 0, length), anOrder);
				339	} else {
				340	// add the entry to the mapping table,
				341	// the same later entry replaces the previous one
				342	mapping.setElementAt(ch, anOrder);
				343	}
				344	}
				345
				346	private final void addContractOrder(String groupChars, int anOrder) {
				347	addContractOrder(groupChars, anOrder, true);
				348	}
				349
				350	/**
				351	* Adds the contracting string into the collation table.
				352	*/
				353	private final void addContractOrder(String groupChars, int anOrder,
				354	boolean fwd)
				355	{
				356	if (contractTable == null) {
				357	contractTable = new Vector(INITIALTABLESIZE);
				358	}
				359
				360	//initial character
				361	int ch = groupChars.codePointAt(0);
				362	/*
				363	char ch0 = groupChars.charAt(0);
				364	int ch = Character.isHighSurrogate(ch0)?
				365	Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
				366	*/
				367	// See if the initial character of the string already has a contract table.
				368	int entry = mapping.elementAt(ch);
				369	Vector entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX);
				370
				371	if (entryTable == null) {
				372	// We need to create a new table of contract entries for this base char
				373	int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size();
				374	entryTable = new Vector(INITIALTABLESIZE);
				375	contractTable.addElement(entryTable);
				376
				377	// Add the initial character's current ordering first. then
				378	// update its mapping to point to this contract table
				379	entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry));
				380	mapping.setElementAt(ch, tableIndex);
				381	}
				382
				383	// Now add (or replace) this string in the table
				384	int index = RBCollationTables.getEntry(entryTable, groupChars, fwd);
				385	if (index != RBCollationTables.UNMAPPED) {
				386	EntryPair pair = (EntryPair) entryTable.elementAt(index);
				387	pair.value = anOrder;
				388	} else {
				389	EntryPair pair = (EntryPair)entryTable.lastElement();
				390
				391	// NOTE: This little bit of logic is here to speed CollationElementIterator
				392	// .nextContractChar(). This code ensures that the longest sequence in
				393	// this list is always the _last_ one in the list. This keeps
				394	// nextContractChar() from having to search the entire list for the longest
				395	// sequence.
				396	if (groupChars.length() > pair.entryName.length()) {
				397	entryTable.addElement(new EntryPair(groupChars, anOrder, fwd));
				398	} else {
				399	entryTable.insertElementAt(new EntryPair(groupChars, anOrder,
				400	fwd), entryTable.size() - 1);
				401	}
				402	}
				403
				404	// If this was a forward mapping for a contracting string, also add a
				405	// reverse mapping for it, so that CollationElementIterator.previous
				406	// can work right
				407	if (fwd && groupChars.length() > 1) {
				408	addContractFlags(groupChars);
				409	addContractOrder(new StringBuffer(groupChars).reverse().toString(),
				410	anOrder, false);
				411	}
				412	}
				413
				414	/**
				415	* If the given string has been specified as a contracting string
				416	* in this collation table, return its ordering.
				417	* Otherwise return UNMAPPED.
				418	*/
				419	private int getContractOrder(String groupChars)
				420	{
				421	int result = RBCollationTables.UNMAPPED;
				422	if (contractTable != null) {
				423	int ch = groupChars.codePointAt(0);
				424	/*
				425	char ch0 = groupChars.charAt(0);
				426	int ch = Character.isHighSurrogate(ch0)?
				427	Character.toCodePoint(ch0, groupChars.charAt(1)):ch0;
				428	*/
				429	Vector entryTable = getContractValues(ch);
				430	if (entryTable != null) {
				431	int index = RBCollationTables.getEntry(entryTable, groupChars, true);
				432	if (index != RBCollationTables.UNMAPPED) {
				433	EntryPair pair = (EntryPair) entryTable.elementAt(index);
				434	result = pair.value;
				435	}
				436	}
				437	}
				438	return result;
				439	}
				440
				441	private final int getCharOrder(int ch) {
				442	int order = mapping.elementAt(ch);
				443
				444	if (order >= RBCollationTables.CONTRACTCHARINDEX) {
				445	Vector groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX);
				446	EntryPair pair = (EntryPair)groupList.firstElement();
				447	order = pair.value;
				448	}
				449	return order;
				450	}
				451
				452	/**
				453	* Get the entry of hash table of the contracting string in the collation
				454	* table.
				455	* @param ch the starting character of the contracting string
				456	*/
				457	private Vector getContractValues(int ch)
				458	{
				459	int index = mapping.elementAt(ch);
				460	return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX);
				461	}
				462
				463	private Vector getContractValuesImpl(int index)
				464	{
				465	if (index >= 0)
				466	{
				467	return (Vector)contractTable.elementAt(index);
				468	}
				469	else // not found
				470	{
				471	return null;
				472	}
				473	}
				474
				475	/**
				476	* Adds the expanding string into the collation table.
				477	*/
				478	private final void addExpandOrder(String contractChars,
				479	String expandChars,
				480	int anOrder) throws ParseException
				481	{
				482	// Create an expansion table entry
				483	int tableIndex = addExpansion(anOrder, expandChars);
				484
				485	// And add its index into the main mapping table
				486	if (contractChars.length() > 1) {
				487	char ch = contractChars.charAt(0);
				488	if (Character.isHighSurrogate(ch) && contractChars.length() == 2) {
				489	char ch2 = contractChars.charAt(1);
				490	if (Character.isLowSurrogate(ch2)) {
				491	//only add into table when it is a legal surrogate
				492	addOrder(Character.toCodePoint(ch, ch2), tableIndex);
				493	}
				494	} else {
				495	addContractOrder(contractChars, tableIndex);
				496	}
				497	} else {
				498	addOrder(contractChars.charAt(0), tableIndex);
				499	}
				500	}
				501
				502	private final void addExpandOrder(int ch, String expandChars, int anOrder)
				503	throws ParseException
				504	{
				505	int tableIndex = addExpansion(anOrder, expandChars);
				506	addOrder(ch, tableIndex);
				507	}
				508
				509	/**
				510	* Create a new entry in the expansion table that contains the orderings
				511	* for the given characers. If anOrder is valid, it is added to the
				512	* beginning of the expanded list of orders.
				513	*/
				514	private int addExpansion(int anOrder, String expandChars) {
				515	if (expandTable == null) {
				516	expandTable = new Vector(INITIALTABLESIZE);
				517	}
				518
				519	// If anOrder is valid, we want to add it at the beginning of the list
				520	int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1;
				521
				522	int[] valueList = new int[expandChars.length() + offset];
				523	if (offset == 1) {
				524	valueList[0] = anOrder;
				525	}
				526
				527	int j = offset;
				528	for (int i = 0; i < expandChars.length(); i++) {
				529	char ch0 = expandChars.charAt(i);
				530	char ch1;
				531	int ch;
				532	if (Character.isHighSurrogate(ch0)) {
				533	if (++i == expandChars.length() \|\|
				534	!Character.isLowSurrogate(ch1=expandChars.charAt(i))) {
				535	//ether we are missing the low surrogate or the next char
				536	//is not a legal low surrogate, so stop loop
				537	break;
				538	}
				539	ch = Character.toCodePoint(ch0, ch1);
				540
				541	} else {
				542	ch = ch0;
				543	}
				544
				545	int mapValue = getCharOrder(ch);
				546
				547	if (mapValue != RBCollationTables.UNMAPPED) {
				548	valueList[j++] = mapValue;
				549	} else {
				550	// can't find it in the table, will be filled in by commit().
				551	valueList[j++] = CHARINDEX + ch;
				552	}
				553	}
				554	if (j < valueList.length) {
				555	//we had at least one supplementary character, the size of valueList
				556	//is bigger than it really needs...
				557	int[] tmpBuf = new int[j];
				558	while (--j >= 0) {
				559	tmpBuf[j] = valueList[j];
				560	}
				561	valueList = tmpBuf;
				562	}
				563	// Add the expanding char list into the expansion table.
				564	int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size();
				565	expandTable.addElement(valueList);
				566
				567	return tableIndex;
				568	}
				569
				570	private void addContractFlags(String chars) {
				571	char c0;
				572	int c;
				573	int len = chars.length();
				574	for (int i = 0; i < len; i++) {
				575	c0 = chars.charAt(i);
				576	c = Character.isHighSurrogate(c0)
				577	?Character.toCodePoint(c0, chars.charAt(++i))
				578	:c0;
				579	contractFlags.put(c, 1);
				580	}
				581	}
				582
				583	// ==============================================================
				584	// constants
				585	// ==============================================================
				586	final static int CHARINDEX = 0x70000000; // need look up in .commit()
				587
				588	private final static int IGNORABLEMASK = 0x0000ffff;
				589	private final static int PRIMARYORDERINCREMENT = 0x00010000;
				590	private final static int SECONDARYORDERINCREMENT = 0x00000100;
				591	private final static int TERTIARYORDERINCREMENT = 0x00000001;
				592	private final static int INITIALTABLESIZE = 20;
				593	private final static int MAXKEYSIZE = 5;
				594
				595	// ==============================================================
				596	// instance variables
				597	// ==============================================================
				598
				599	// variables used by the build process
				600	private RBCollationTables.BuildAPI tables = null;
				601	private MergeCollation mPattern = null;
				602	private boolean isOverIgnore = false;
				603	private char[] keyBuf = new char[MAXKEYSIZE];
				604	private IntHashtable contractFlags = new IntHashtable(100);
				605
				606	// "shadow" copies of the instance variables in RBCollationTables
				607	// (the values in these variables are copied back into RBCollationTables
				608	// at the end of the build process)
				609	private boolean frenchSec = false;
				610	private boolean seAsianSwapping = false;
				611
				612	private UCompactIntArray mapping = null;
				613	private Vector contractTable = null;
				614	private Vector expandTable = null;
				615
				616	private short maxSecOrder = 0;
				617	private short maxTerOrder = 0;
				618	}