Blame - jdk/src/share/classes/sun/text/normalizer/UnicodeSet.java - platform/libcore

blob: eaffb7c8b44160a09d469bff9932bb646d839dad [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	/*
				27	*******************************************************************************
				28	* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
				29	* *
				30	* The original version of this source code and documentation is copyrighted *
				31	* and owned by IBM, These materials are provided under terms of a License *
				32	* Agreement between IBM and Sun. This technology is protected by multiple *
				33	* US and International patents. This notice and attribution to IBM may not *
				34	* to removed. *
				35	*******************************************************************************
				36	*/
				37
				38	package sun.text.normalizer;
				39
				40	import java.text.ParsePosition;
				41	import java.util.Map;
				42	import java.util.HashMap;
				43	import java.util.TreeSet;
				44	import java.util.Iterator;
				45	import java.util.Collection;
				46
				47	/**
				48	* A mutable set of Unicode characters and multicharacter strings. Objects of this class
				49	* represent <em>character classes</em> used in regular expressions.
				50	* A character class specifies a subset of Unicode code points. Legal
				51	* code points are U+0000 to U+10FFFF, inclusive.
				52	*
				53	* <p>The UnicodeSet class is not designed to be subclassed.
				54	*
				55	* <p><code>UnicodeSet</code> supports two APIs. The first is the
				56	* <em>operand</em> API that allows the caller to modify the value of
				57	* a <code>UnicodeSet</code> object. It conforms to Java 2's
				58	* <code>java.util.Set</code> interface, although
				59	* <code>UnicodeSet</code> does not actually implement that
				60	* interface. All methods of <code>Set</code> are supported, with the
				61	* modification that they take a character range or single character
				62	* instead of an <code>Object</code>, and they take a
				63	* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
				64	* operand API may be thought of in terms of boolean logic: a boolean
				65	* OR is implemented by <code>add</code>, a boolean AND is implemented
				66	* by <code>retain</code>, a boolean XOR is implemented by
				67	* <code>complement</code> taking an argument, and a boolean NOT is
				68	* implemented by <code>complement</code> with no argument. In terms
				69	* of traditional set theory function names, <code>add</code> is a
				70	* union, <code>retain</code> is an intersection, <code>remove</code>
				71	* is an asymmetric difference, and <code>complement</code> with no
				72	* argument is a set complement with respect to the superset range
				73	* <code>MIN_VALUE-MAX_VALUE</code>
				74	*
				75	* <p>The second API is the
				76	* <code>applyPattern()</code>/<code>toPattern()</code> API from the
				77	* <code>java.text.Format</code>-derived classes. Unlike the
				78	* methods that add characters, add categories, and control the logic
				79	* of the set, the method <code>applyPattern()</code> sets all
				80	* attributes of a <code>UnicodeSet</code> at once, based on a
				81	* string pattern.
				82	*
				83	* <p><b>Pattern syntax</b></p>
				84	*
				85	* Patterns are accepted by the constructors and the
				86	* <code>applyPattern()</code> methods and returned by the
				87	* <code>toPattern()</code> method. These patterns follow a syntax
				88	* similar to that employed by version 8 regular expression character
				89	* classes. Here are some simple examples:
				90	*
				91	* <blockquote>
				92	* <table>
				93	* <tr align="top">
				94	* <td nowrap valign="top" align="left"><code>[]</code></td>
				95	* <td valign="top">No characters</td>
				96	* </tr><tr align="top">
				97	* <td nowrap valign="top" align="left"><code>[a]</code></td>
				98	* <td valign="top">The character 'a'</td>
				99	* </tr><tr align="top">
				100	* <td nowrap valign="top" align="left"><code>[ae]</code></td>
				101	* <td valign="top">The characters 'a' and 'e'</td>
				102	* </tr>
				103	* <tr>
				104	* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
				105	* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
				106	* point order</td>
				107	* </tr>
				108	* <tr>
				109	* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
				110	* <td valign="top">The character U+4E01</td>
				111	* </tr>
				112	* <tr>
				113	* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
				114	* <td valign="top">The character 'a' and the multicharacter strings "ab" and
				115	* "ac"</td>
				116	* </tr>
				117	* <tr>
				118	* <td nowrap valign="top" align="left"><code>[\p{Lu}]</code></td>
				119	* <td valign="top">All characters in the general category Uppercase Letter</td>
				120	* </tr>
				121	* </table>
				122	* </blockquote>
				123	*
				124	* Any character may be preceded by a backslash in order to remove any special
				125	* meaning. White space characters, as defined by UCharacterProperty.isRuleWhiteSpace(), are
				126	* ignored, unless they are escaped.
				127	*
				128	* <p>Property patterns specify a set of characters having a certain
				129	* property as defined by the Unicode standard. Both the POSIX-like
				130	* "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
				131	* complete list of supported property patterns, see the User's Guide
				132	* for UnicodeSet at
				133	* <a href="http://oss.software.ibm.com/icu/userguide/unicodeSet.html">
				134	* http://oss.software.ibm.com/icu/userguide/unicodeSet.html</a>.
				135	* Actual determination of property data is defined by the underlying
				136	* Unicode database as implemented by UCharacter.
				137	*
				138	* <p>Patterns specify individual characters, ranges of characters, and
				139	* Unicode property sets. When elements are concatenated, they
				140	* specify their union. To complement a set, place a '^' immediately
				141	* after the opening '['. Property patterns are inverted by modifying
				142	* their delimiters; "[:^foo:]" and "\P{foo}". In any other location,
				143	* '^' has no special meaning.
				144	*
				145	* <p>Ranges are indicated by placing a '-' between two
				146	* characters, as in "a-z". This specifies the range of all
				147	* characters from the left to the right, in Unicode order. If the
				148	* left character is greater than or equal to the
				149	* right character it is a syntax error. If a '-' occurs as the first
				150	* character after the opening '[' or '[^', or if it occurs as the
				151	* last character before the closing ']', then it is taken as a
				152	* literal. Thus "[a\\-b]", "[-ab]", and "[ab-]" all indicate the same
				153	* set of three characters, 'a', 'b', and '-'.
				154	*
				155	* <p>Sets may be intersected using the '&' operator or the asymmetric
				156	* set difference may be taken using the '-' operator, for example,
				157	* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
				158	* with values less than 4096. Operators ('&' and '-') have equal
				159	* precedence and bind left-to-right. Thus
				160	* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
				161	* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
				162	* difference; intersection is commutative.
				163	*
				164	* <table>
				165	* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
				166	* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
				167	* through 'z' and all letters in between, in Unicode order
				168	* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
				169	* all characters but 'a' through 'z',
				170	* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
				171	* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
				172	* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
				173	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
				174	* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
				175	* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
				176	* <td>The asymmetric difference of sets specified by <em>pat1</em> and
				177	* <em>pat2</em>
				178	* <tr valign=top><td nowrap><code>[:Lu:] or \p{Lu}</code>
				179	* <td>The set of characters having the specified
				180	* Unicode property; in
				181	* this case, Unicode uppercase letters
				182	* <tr valign=top><td nowrap><code>[:^Lu:] or \P{Lu}</code>
				183	* <td>The set of characters <em>not</em> having the given
				184	* Unicode property
				185	* </table>
				186	*
				187	* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
				188	*
				189	* <p><b>Formal syntax</b></p>
				190	*
				191	* <blockquote>
				192	* <table>
				193	* <tr align="top">
				194	* <td nowrap valign="top" align="right"><code>pattern :=  </code></td>
				195	* <td valign="top"><code>('[' '^'? item* ']') \|
				196	* property</code></td>
				197	* </tr>
				198	* <tr align="top">
				199	* <td nowrap valign="top" align="right"><code>item :=  </code></td>
				200	* <td valign="top"><code>char \| (char '-' char) \| pattern-expr<br>
				201	* </code></td>
				202	* </tr>
				203	* <tr align="top">
				204	* <td nowrap valign="top" align="right"><code>pattern-expr :=  </code></td>
				205	* <td valign="top"><code>pattern \| pattern-expr pattern \|
				206	* pattern-expr op pattern<br>
				207	* </code></td>
				208	* </tr>
				209	* <tr align="top">
				210	* <td nowrap valign="top" align="right"><code>op :=  </code></td>
				211	* <td valign="top"><code>'&' \| '-'<br>
				212	* </code></td>
				213	* </tr>
				214	* <tr align="top">
				215	* <td nowrap valign="top" align="right"><code>special :=  </code></td>
				216	* <td valign="top"><code>'[' \| ']' \| '-'<br>
				217	* </code></td>
				218	* </tr>
				219	* <tr align="top">
				220	* <td nowrap valign="top" align="right"><code>char :=  </code></td>
				221	* <td valign="top"><em>any character that is not</em><code> special<br>
				222	* \| ('\\' </code><em>any character</em><code>)<br>
				223	* \| ('&#92;u' hex hex hex hex)<br>
				224	* </code></td>
				225	* </tr>
				226	* <tr align="top">
				227	* <td nowrap valign="top" align="right"><code>hex :=  </code></td>
				228	* <td valign="top"><em>any character for which
				229	* </em><code>Character.digit(c, 16)</code><em>
				230	* returns a non-negative result</em></td>
				231	* </tr>
				232	* <tr>
				233	* <td nowrap valign="top" align="right"><code>property :=  </code></td>
				234	* <td valign="top"><em>a Unicode property set pattern</em></td>
				235	* </tr>
				236	* </table>
				237	* <br>
				238	* <table border="1">
				239	* <tr>
				240	* <td>Legend: <table>
				241	* <tr>
				242	* <td nowrap valign="top"><code>a := b</code></td>
				243	* <td width="20" valign="top">  </td>
				244	* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
				245	* </tr>
				246	* <tr>
				247	* <td nowrap valign="top"><code>a?</code></td>
				248	* <td valign="top"></td>
				249	* <td valign="top">zero or one instance of <code>a</code><br>
				250	* </td>
				251	* </tr>
				252	* <tr>
				253	* <td nowrap valign="top"><code>a*</code></td>
				254	* <td valign="top"></td>
				255	* <td valign="top">zero or more instances of <code>a</code><br>
				256	* </td>
				257	* </tr>
				258	* <tr>
				259	* <td nowrap valign="top"><code>a \| b</code></td>
				260	* <td valign="top"></td>
				261	* <td valign="top">either <code>a</code> or <code>b</code><br>
				262	* </td>
				263	* </tr>
				264	* <tr>
				265	* <td nowrap valign="top"><code>'a'</code></td>
				266	* <td valign="top"></td>
				267	* <td valign="top">the literal string between the quotes </td>
				268	* </tr>
				269	* </table>
				270	* </td>
				271	* </tr>
				272	* </table>
				273	* </blockquote>
				274	*
				275	* @author Alan Liu
				276	* @stable ICU 2.0
				277	*/
				278	public class UnicodeSet implements UnicodeMatcher {
				279
				280	private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints
				281	private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units.
				282	// 110000 for codepoints
				283
				284	/**
				285	* Minimum value that can be stored in a UnicodeSet.
				286	* @stable ICU 2.0
				287	*/
				288	public static final int MIN_VALUE = LOW;
				289
				290	/**
				291	* Maximum value that can be stored in a UnicodeSet.
				292	* @stable ICU 2.0
				293	*/
				294	public static final int MAX_VALUE = HIGH - 1;
				295
				296	private int len; // length used; list may be longer to minimize reallocs
				297	private int[] list; // MUST be terminated with HIGH
				298	private int[] rangeList; // internal buffer
				299	private int[] buffer; // internal buffer
				300
				301	// NOTE: normally the field should be of type SortedSet; but that is missing a public clone!!
				302	// is not private so that UnicodeSetIterator can get access
				303	TreeSet strings = new TreeSet();
				304
				305	/**
				306	* The pattern representation of this set. This may not be the
				307	* most economical pattern. It is the pattern supplied to
				308	* applyPattern(), with variables substituted and whitespace
				309	* removed. For sets constructed without applyPattern(), or
				310	* modified using the non-pattern API, this string will be null,
				311	* indicating that toPattern() must generate a pattern
				312	* representation from the inversion list.
				313	*/
				314	private String pat = null;
				315
				316	private static final int START_EXTRA = 16; // initial storage. Must be >= 0
				317	private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
				318
				319	/**
				320	* A set of all characters _except_ the second through last characters of
				321	* certain ranges. These ranges are ranges of characters whose
				322	* properties are all exactly alike, e.g. CJK Ideographs from
				323	* U+4E00 to U+9FA5.
				324	*/
				325	private static UnicodeSet INCLUSIONS = null;
				326
				327	//----------------------------------------------------------------
				328	// Public API
				329	//----------------------------------------------------------------
				330
				/**
				 * Creates a set that contains no characters.
				 * @stable ICU 2.0
				 */
				public UnicodeSet() {
					// The empty set is represented by the HIGH terminator alone.
					list = new int[START_EXTRA + 1];
					list[0] = HIGH;
					len = 1;
				}
				339
				/**
				 * Creates a set holding exactly the characters of the given range.
				 * If <code>start > end</code> the range is empty and so is the
				 * resulting set.
				 *
				 * @param start first character, inclusive, of range
				 * @param end last character, inclusive, of range
				 * @exception java.lang.IllegalArgumentException if either bound is
				 * rejected by {@link #complement(int, int)}, i.e. lies outside
				 * <code>MIN_VALUE..MAX_VALUE</code>
				 * @stable ICU 2.0
				 */
				public UnicodeSet(int start, int end) {
					this();
					// The set is empty at this point, so complementing the range adds it.
					complement(start, end);
				}
				352
				/**
				 * Creates a set described by a pattern string. The pattern language
				 * is explained in the class description; whitespace in the pattern
				 * is ignored.
				 *
				 * @param pattern a string specifying what characters are in the set
				 * @exception java.lang.IllegalArgumentException if the pattern contains
				 * a syntax error.
				 * @stable ICU 2.0
				 */
				public UnicodeSet(String pattern) {
					this();
					// Start from the empty set, then let the parser populate it.
					applyPattern(pattern, null, null, IGNORE_SPACE);
				}
				365
				366	/**
				367	* Make this object represent the same set as <code>other</code>.
				368	* @param other a <code>UnicodeSet</code> whose value will be
				369	* copied to this object
				370	* @stable ICU 2.0
				371	*/
				372	public UnicodeSet set(UnicodeSet other) {
				373	list = (int[]) other.list.clone();
				374	len = other.len;
				375	pat = other.pat;
				376	strings = (TreeSet)other.strings.clone();
				377	return this;
				378	}
				379
				380	/**
				381	* Modifies this set to represent the set specified by the given pattern.
				382	* See the class description for the syntax of the pattern language.
				383	* Whitespace is ignored.
				384	* @param pattern a string specifying what characters are in the set
				385	* @exception java.lang.IllegalArgumentException if the pattern
				386	* contains a syntax error.
				387	* @stable ICU 2.0
				388	*/
				389	public final UnicodeSet applyPattern(String pattern) {
				390	return applyPattern(pattern, null, null, IGNORE_SPACE);
				391	}
				392
				393	/**
				394	* Append the <code>toPattern()</code> representation of a
				395	* string to the given <code>StringBuffer</code>.
				396	*/
				397	private static void _appendToPat(StringBuffer buf, String s, boolean escapeUnprintable) {
				398	for (int i = 0; i < s.length(); i += UTF16.getCharCount(i)) {
				399	_appendToPat(buf, UTF16.charAt(s, i), escapeUnprintable);
				400	}
				401	}
				402
				403	/**
				404	* Append the <code>toPattern()</code> representation of a
				405	* character to the given <code>StringBuffer</code>.
				406	*/
				407	private static void _appendToPat(StringBuffer buf, int c, boolean escapeUnprintable) {
				408	if (escapeUnprintable && Utility.isUnprintable(c)) {
				409	// Use hex escape notation (<backslash>uxxxx or <backslash>Uxxxxxxxx) for anything
				410	// unprintable
				411	if (Utility.escapeUnprintable(buf, c)) {
				412	return;
				413	}
				414	}
				415	// Okay to let ':' pass through
				416	switch (c) {
				417	case '[': // SET_OPEN:
				418	case ']': // SET_CLOSE:
				419	case '-': // HYPHEN:
				420	case '^': // COMPLEMENT:
				421	case '&': // INTERSECTION:
				422	case '\\': //BACKSLASH:
				423	case '{':
				424	case '}':
				425	case '$':
				426	case ':':
				427	buf.append('\\');
				428	break;
				429	default:
				430	// Escape whitespace
				431	if (UCharacterProperty.isRuleWhiteSpace(c)) {
				432	buf.append('\\');
				433	}
				434	break;
				435	}
				436	UTF16.append(buf, c);
				437	}
				438
				439	/**
				440	* Append a string representation of this set to result. This will be
				441	* a cleaned version of the string passed to applyPattern(), if there
				442	* is one. Otherwise it will be generated.
				443	*/
				444	private StringBuffer _toPattern(StringBuffer result,
				445	boolean escapeUnprintable) {
				446	if (pat != null) {
				447	int i;
				448	int backslashCount = 0;
				449	for (i=0; i<pat.length(); ) {
				450	int c = UTF16.charAt(pat, i);
				451	i += UTF16.getCharCount(c);
				452	if (escapeUnprintable && Utility.isUnprintable(c)) {
				453	// If the unprintable character is preceded by an odd
				454	// number of backslashes, then it has been escaped.
				455	// Before unescaping it, we delete the final
				456	// backslash.
				457	if ((backslashCount % 2) == 1) {
				458	result.setLength(result.length() - 1);
				459	}
				460	Utility.escapeUnprintable(result, c);
				461	backslashCount = 0;
				462	} else {
				463	UTF16.append(result, c);
				464	if (c == '\\') {
				465	++backslashCount;
				466	} else {
				467	backslashCount = 0;
				468	}
				469	}
				470	}
				471	return result;
				472	}
				473
				474	return _generatePattern(result, escapeUnprintable);
				475	}
				476
				477	/**
				478	* Generate and append a string representation of this set to result.
				479	* This does not use this.pat, the cleaned up copy of the string
				480	* passed to applyPattern().
				481	* @stable ICU 2.0
				482	*/
				483	public StringBuffer _generatePattern(StringBuffer result,
				484	boolean escapeUnprintable) {
				485	result.append('[');
				486
				487	int count = getRangeCount();
				488
				489	// If the set contains at least 2 intervals and includes both
				490	// MIN_VALUE and MAX_VALUE, then the inverse representation will
				491	// be more economical.
				492	if (count > 1 &&
				493	getRangeStart(0) == MIN_VALUE &&
				494	getRangeEnd(count-1) == MAX_VALUE) {
				495
				496	// Emit the inverse
				497	result.append('^');
				498
				499	for (int i = 1; i < count; ++i) {
				500	int start = getRangeEnd(i-1)+1;
				501	int end = getRangeStart(i)-1;
				502	_appendToPat(result, start, escapeUnprintable);
				503	if (start != end) {
				504	if ((start+1) != end) {
				505	result.append('-');
				506	}
				507	_appendToPat(result, end, escapeUnprintable);
				508	}
				509	}
				510	}
				511
				512	// Default; emit the ranges as pairs
				513	else {
				514	for (int i = 0; i < count; ++i) {
				515	int start = getRangeStart(i);
				516	int end = getRangeEnd(i);
				517	_appendToPat(result, start, escapeUnprintable);
				518	if (start != end) {
				519	if ((start+1) != end) {
				520	result.append('-');
				521	}
				522	_appendToPat(result, end, escapeUnprintable);
				523	}
				524	}
				525	}
				526
				527	if (strings.size() > 0) {
				528	Iterator it = strings.iterator();
				529	while (it.hasNext()) {
				530	result.append('{');
				531	_appendToPat(result, (String) it.next(), escapeUnprintable);
				532	result.append('}');
				533	}
				534	}
				535	return result.append(']');
				536	}
				537
				538	/**
				539	* Adds the specified range to this set if it is not already
				540	* present. If this set already contains the specified range,
				541	* the call leaves this set unchanged. If <code>end > start</code>
				542	* then an empty range is added, leaving the set unchanged.
				543	*
				544	* @param start first character, inclusive, of range to be added
				545	* to this set.
				546	* @param end last character, inclusive, of range to be added
				547	* to this set.
				548	* @stable ICU 2.0
				549	*/
				550	public UnicodeSet add(int start, int end) {
				551	if (start < MIN_VALUE \|\| start > MAX_VALUE) {
				552	throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
				553	}
				554	if (end < MIN_VALUE \|\| end > MAX_VALUE) {
				555	throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
				556	}
				557	if (start < end) {
				558	add(range(start, end), 2, 0);
				559	} else if (start == end) {
				560	add(start);
				561	}
				562	return this;
				563	}
				564
				565	/**
				566	* Adds the specified character to this set if it is not already
				567	* present. If this set already contains the specified character,
				568	* the call leaves this set unchanged.
				569	* @stable ICU 2.0
				570	*/
				571	public final UnicodeSet add(int c) {
				572	if (c < MIN_VALUE \|\| c > MAX_VALUE) {
				573	throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
				574	}
				575
				576	// find smallest i such that c < list[i]
				577	// if odd, then it is IN the set
				578	// if even, then it is OUT of the set
				579	int i = findCodePoint(c);
				580
				581	// already in set?
				582	if ((i & 1) != 0) return this;
				583
				584	// HIGH is 0x110000
				585	// assert(list[len-1] == HIGH);
				586
				587	// empty = [HIGH]
				588	// [start_0, limit_0, start_1, limit_1, HIGH]
				589
				590	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
				591	// ^
				592	// list[i]
				593
				594	// i == 0 means c is before the first range
				595
				596	if (c == list[i]-1) {
				597	// c is before start of next range
				598	list[i] = c;
				599	// if we touched the HIGH mark, then add a new one
				600	if (c == MAX_VALUE) {
				601	ensureCapacity(len+1);
				602	list[len++] = HIGH;
				603	}
				604	if (i > 0 && c == list[i-1]) {
				605	// collapse adjacent ranges
				606
				607	// [..., start_k-1, c, c, limit_k, ..., HIGH]
				608	// ^
				609	// list[i]
				610	System.arraycopy(list, i+1, list, i-1, len-i-1);
				611	len -= 2;
				612	}
				613	}
				614
				615	else if (i > 0 && c == list[i-1]) {
				616	// c is after end of prior range
				617	list[i-1]++;
				618	// no need to chcek for collapse here
				619	}
				620
				621	else {
				622	// At this point we know the new char is not adjacent to
				623	// any existing ranges, and it is not 10FFFF.
				624
				625
				626	// [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
				627	// ^
				628	// list[i]
				629
				630	// [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
				631	// ^
				632	// list[i]
				633
				634	// Don't use ensureCapacity() to save on copying.
				635	// NOTE: This has no measurable impact on performance,
				636	// but it might help in some usage patterns.
				637	if (len+2 > list.length) {
				638	int[] temp = new int[len + 2 + GROW_EXTRA];
				639	if (i != 0) System.arraycopy(list, 0, temp, 0, i);
				640	System.arraycopy(list, i, temp, i+2, len-i);
				641	list = temp;
				642	} else {
				643	System.arraycopy(list, i, list, i+2, len-i);
				644	}
				645
				646	list[i] = c;
				647	list[i+1] = c+1;
				648	len += 2;
				649	}
				650
				651	pat = null;
				652	return this;
				653	}
				654
				655	/**
				656	* Adds the specified multicharacter to this set if it is not already
				657	* present. If this set already contains the multicharacter,
				658	* the call leaves this set unchanged.
				659	* Thus "ch" => {"ch"}
				660	* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
				661	* @param s the source string
				662	* @return this object, for chaining
				663	* @stable ICU 2.0
				664	*/
				665	public final UnicodeSet add(String s) {
				666
				667	int cp = getSingleCP(s);
				668	if (cp < 0) {
				669	strings.add(s);
				670	pat = null;
				671	} else {
				672	add(cp, cp);
				673	}
				674	return this;
				675	}
				676
				677	/**
				678	* @return a code point IF the string consists of a single one.
				679	* otherwise returns -1.
				680	* @param string to test
				681	*/
				682	private static int getSingleCP(String s) {
				683	if (s.length() < 1) {
				684	throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
				685	}
				686	if (s.length() > 2) return -1;
				687	if (s.length() == 1) return s.charAt(0);
				688
				689	// at this point, len = 2
				690	int cp = UTF16.charAt(s, 0);
				691	if (cp > 0xFFFF) { // is surrogate pair
				692	return cp;
				693	}
				694	return -1;
				695	}
				696
				697	/**
				698	* Complements the specified range in this set. Any character in
				699	* the range will be removed if it is in this set, or will be
				700	* added if it is not in this set. If <code>end > start</code>
				701	* then an empty range is complemented, leaving the set unchanged.
				702	*
				703	* @param start first character, inclusive, of range to be removed
				704	* from this set.
				705	* @param end last character, inclusive, of range to be removed
				706	* from this set.
				707	* @stable ICU 2.0
				708	*/
				709	public UnicodeSet complement(int start, int end) {
				710	if (start < MIN_VALUE \|\| start > MAX_VALUE) {
				711	throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(start, 6));
				712	}
				713	if (end < MIN_VALUE \|\| end > MAX_VALUE) {
				714	throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
				715	}
				716	if (start <= end) {
				717	xor(range(start, end), 2, 0);
				718	}
				719	pat = null;
				720	return this;
				721	}
				722
				723	/**
				724	* This is equivalent to
				725	* <code>complement(MIN_VALUE, MAX_VALUE)</code>.
				726	* @stable ICU 2.0
				727	*/
				728	public UnicodeSet complement() {
				729	if (list[0] == LOW) {
				730	System.arraycopy(list, 1, list, 0, len-1);
				731	--len;
				732	} else {
				733	ensureCapacity(len+1);
				734	System.arraycopy(list, 0, list, 1, len);
				735	list[0] = LOW;
				736	++len;
				737	}
				738	pat = null;
				739	return this;
				740	}
				741
				742	/**
				743	* Returns true if this set contains the given character.
				744	* @param c character to be checked for containment
				745	* @return true if the test condition is met
				746	* @stable ICU 2.0
				747	*/
				748	public boolean contains(int c) {
				749	if (c < MIN_VALUE \|\| c > MAX_VALUE) {
				750	throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
				751	}
				752
				753	/*
				754	// Set i to the index of the start item greater than ch
				755	// We know we will terminate without length test!
				756	int i = -1;
				757	while (true) {
				758	if (c < list[++i]) break;
				759	}
				760	*/
				761
				762	int i = findCodePoint(c);
				763
				764	return ((i & 1) != 0); // return true if odd
				765	}
				766
				767	/**
				768	* Returns the smallest value i such that c < list[i]. Caller
				769	* must ensure that c is a legal value or this method will enter
				770	* an infinite loop. This method performs a binary search.
				771	* @param c a character in the range MIN_VALUE..MAX_VALUE
				772	* inclusive
				773	* @return the smallest integer i in the range 0..len-1,
				774	* inclusive, such that c < list[i]
				775	*/
				776	private final int findCodePoint(int c) {
				777	/* Examples:
				778	findCodePoint(c)
				779	set list[] c=0 1 3 4 7 8
				780	=== ============== ===========
				781	[] [110000] 0 0 0 0 0 0
				782	[\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
				783	[\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
				784	[:all:] [0, 110000] 1 1 1 1 1 1
				785	*/
				786
				787	// Return the smallest i such that c < list[i]. Assume
				788	// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
				789	if (c < list[0]) return 0;
				790	// High runner test. c is often after the last range, so an
				791	// initial check for this condition pays off.
				792	if (len >= 2 && c >= list[len-2]) return len-1;
				793	int lo = 0;
				794	int hi = len - 1;
				795	// invariant: c >= list[lo]
				796	// invariant: c < list[hi]
				797	for (;;) {
				798	int i = (lo + hi) >>> 1;
				799	if (i == lo) return hi;
				800	if (c < list[i]) {
				801	hi = i;
				802	} else {
				803	lo = i;
				804	}
				805	}
				806	}
				807
				808	/**
				809	* Adds all of the elements in the specified set to this set if
				810	* they're not already present. This operation effectively
				811	* modifies this set so that its value is the <i>union</i> of the two
				812	* sets. The behavior of this operation is unspecified if the specified
				813	* collection is modified while the operation is in progress.
				814	*
				815	* @param c set whose elements are to be added to this set.
				816	* @stable ICU 2.0
				817	*/
				818	public UnicodeSet addAll(UnicodeSet c) {
				819	add(c.list, c.len, 0);
				820	strings.addAll(c.strings);
				821	return this;
				822	}
				823
				824	/**
				825	* Retains only the elements in this set that are contained in the
				826	* specified set. In other words, removes from this set all of
				827	* its elements that are not contained in the specified set. This
				828	* operation effectively modifies this set so that its value is
				829	* the <i>intersection</i> of the two sets.
				830	*
				831	* @param c set that defines which elements this set will retain.
				832	* @stable ICU 2.0
				833	*/
				834	public UnicodeSet retainAll(UnicodeSet c) {
				835	retain(c.list, c.len, 0);
				836	strings.retainAll(c.strings);
				837	return this;
				838	}
				839
				840	/**
				841	* Removes from this set all of its elements that are contained in the
				842	* specified set. This operation effectively modifies this
				843	* set so that its value is the <i>asymmetric set difference</i> of
				844	* the two sets.
				845	*
				846	* @param c set that defines which elements will be removed from
				847	* this set.
				848	* @stable ICU 2.0
				849	*/
				850	public UnicodeSet removeAll(UnicodeSet c) {
				851	retain(c.list, c.len, 2);
				852	strings.removeAll(c.strings);
				853	return this;
				854	}
				855
				856	/**
				857	* Removes all of the elements from this set. This set will be
				858	* empty after this call returns.
				859	* @stable ICU 2.0
				860	*/
				861	public UnicodeSet clear() {
				862	list[0] = HIGH;
				863	len = 1;
				864	pat = null;
				865	strings.clear();
				866	return this;
				867	}
				868
				869	/**
				870	* Iteration method that returns the number of ranges contained in
				871	* this set.
				872	* @see #getRangeStart
				873	* @see #getRangeEnd
				874	* @stable ICU 2.0
				875	*/
				876	public int getRangeCount() {
				877	return len/2;
				878	}
				879
				880	/**
				881	* Iteration method that returns the first character in the
				882	* specified range of this set.
				883	* @exception ArrayIndexOutOfBoundsException if index is outside
				884	* the range <code>0..getRangeCount()-1</code>
				885	* @see #getRangeCount
				886	* @see #getRangeEnd
				887	* @stable ICU 2.0
				888	*/
				889	public int getRangeStart(int index) {
				890	return list[index*2];
				891	}
				892
				893	/**
				894	* Iteration method that returns the last character in the
				895	* specified range of this set.
				896	* @exception ArrayIndexOutOfBoundsException if index is outside
				897	* the range <code>0..getRangeCount()-1</code>
				898	* @see #getRangeStart
				899	* @see #getRangeEnd
				900	* @stable ICU 2.0
				901	*/
				902	public int getRangeEnd(int index) {
				903	return (list[index*2 + 1] - 1);
				904	}
				905
				906	//----------------------------------------------------------------
				907	// Implementation: Pattern parsing
				908	//----------------------------------------------------------------
				909
				910	/**
				911	* Parses the given pattern, starting at the given position. The character
				912	* at pattern.charAt(pos.getIndex()) must be '[', or the parse fails.
				913	* Parsing continues until the corresponding closing ']'. If a syntax error
				914	* is encountered between the opening and closing brace, the parse fails.
				915	* Upon return from a successful parse, the ParsePosition is updated to
				916	* point to the character following the closing ']', and an inversion
				917	* list for the parsed pattern is returned. This method
				918	* calls itself recursively to parse embedded subpatterns.
				919	*
				920	* @param pattern the string containing the pattern to be parsed. The
				921	* portion of the string from pos.getIndex(), which must be a '[', to the
				922	* corresponding closing ']', is parsed.
				923	* @param pos upon entry, the position at which to being parsing. The
				924	* character at pattern.charAt(pos.getIndex()) must be a '['. Upon return
				925	* from a successful parse, pos.getIndex() is either the character after the
				926	* closing ']' of the parsed pattern, or pattern.length() if the closing ']'
				927	* is the last character of the pattern string.
				928	* @return an inversion list for the parsed substring
				929	* of <code>pattern</code>
				930	* @exception java.lang.IllegalArgumentException if the parse fails.
				931	*/
				932	UnicodeSet applyPattern(String pattern,
				933	ParsePosition pos,
				934	SymbolTable symbols,
				935	int options) {
				936
				937	// Need to build the pattern in a temporary string because
				938	// _applyPattern calls add() etc., which set pat to empty.
				939	boolean parsePositionWasNull = pos == null;
				940	if (parsePositionWasNull) {
				941	pos = new ParsePosition(0);
				942	}
				943
				944	StringBuffer rebuiltPat = new StringBuffer();
				945	RuleCharacterIterator chars =
				946	new RuleCharacterIterator(pattern, symbols, pos);
				947	applyPattern(chars, symbols, rebuiltPat, options);
				948	if (chars.inVariable()) {
				949	syntaxError(chars, "Extra chars in variable value");
				950	}
				951	pat = rebuiltPat.toString();
				952	if (parsePositionWasNull) {
				953	int i = pos.getIndex();
				954
				955	// Skip over trailing whitespace
				956	if ((options & IGNORE_SPACE) != 0) {
				957	i = Utility.skipWhitespace(pattern, i);
				958	}
				959
				960	if (i != pattern.length()) {
				961	throw new IllegalArgumentException("Parse of \"" + pattern +
				962	"\" failed at " + i);
				963	}
				964	}
				965	return this;
				966	}
				967
				968	/**
				969	* Parse the pattern from the given RuleCharacterIterator. The
				970	* iterator is advanced over the parsed pattern.
					* Any previous contents of this set are discarded first (clear()
					* is called before parsing starts).
				971	* @param chars iterator over the pattern characters. Upon return
				972	* it will be advanced to the first character after the parsed
				973	* pattern, or the end of the iteration if all characters are
				974	* parsed.
				975	* @param symbols symbol table to use to parse and dereference
				976	* variables, or null if none.
				977	* @param rebuiltPat the pattern that was parsed, rebuilt or
				978	* copied from the input pattern, as appropriate.
				979	* @param options a bit mask of zero or more of the following:
				980	* IGNORE_SPACE, CASE.
					* Only IGNORE_SPACE is examined in this method; the mask is
					* otherwise passed on unchanged to nested applyPattern calls.
					* @exception java.lang.IllegalArgumentException if the pattern is
					* malformed; every syntaxError() call below throws it.
				981	*/
				982	void applyPattern(RuleCharacterIterator chars, SymbolTable symbols,
				983	StringBuffer rebuiltPat, int options) {
				984
				985	// Syntax characters: [ ] ^ - & { }
				986
				987	// Recognized special forms for chars, sets: c-c s-s s&s
				988
					// Iterator options: always expand variables and escapes, and skip
					// white space as well when the caller asked for IGNORE_SPACE.
				989	int opts = RuleCharacterIterator.PARSE_VARIABLES \|
				990	RuleCharacterIterator.PARSE_ESCAPES;
				991	if ((options & IGNORE_SPACE) != 0) {
				992	opts \|= RuleCharacterIterator.SKIP_WHITESPACE;
				993	}
				994
					// pat: pattern text rebuilt while tokens are consumed. This is a
					// local variable, not the pat field used elsewhere in the class.
					// buf: scratch buffer reused for each {multi-character string}.
				995	StringBuffer pat = new StringBuffer(), buf = null;
					// usePat becomes true once a nested set, property pattern or '$'
					// anchor has been parsed; it selects which text is emitted at the end.
				996	boolean usePat = false;
					// scratch: lazily created set that receives inline nested sets and
					// property patterns.
				997	UnicodeSet scratch = null;
					// backup: iterator position saved with chars.getPos() so that a
					// look-ahead can be undone with chars.setPos().
				998	Object backup = null;
				999
				1000	// mode: 0=before [, 1=between [...], 2=after ]
				1001	// lastItem: 0=none, 1=char, 2=set
				1002	int lastItem = 0, lastChar = 0, mode = 0;
					// op: pending binary operator ('-' or '&'), or 0 if none.
				1003	char op = 0;
				1004
					// invert: set when the pattern starts with "[^"; the complement is
					// applied after the body has been parsed.
				1005	boolean invert = false;
				1006
					// Start from an empty set; earlier contents are dropped.
				1007	clear();
				1008
				1009	while (mode != 2 && !chars.atEnd()) {
					// Dead code: the condition is the constant false, so this
					// consistency check never executes.
				1010	if (false) {
				1011	// Debugging assertion
				1012	if (!((lastItem == 0 && op == 0) \|\|
				1013	(lastItem == 1 && (op == 0 \|\| op == '-')) \|\|
				1014	(lastItem == 2 && (op == 0 \|\| op == '-' \|\| op == '&')))) {
				1015	throw new IllegalArgumentException();
				1016	}
				1017	}
				1018
					// Per-iteration state: the current code point, whether it was
					// escaped, and the nested set (if the token turns out to be one).
				1019	int c = 0;
				1020	boolean literal = false;
				1021	UnicodeSet nested = null;
				1022
				1023	// -------- Check for property pattern
				1024
				1025	// setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
				1026	int setMode = 0;
				1027	if (resemblesPropertyPattern(chars, opts)) {
				1028	setMode = 2;
				1029	}
				1030
				1031	// -------- Parse '[' of opening delimiter OR nested set.
				1032	// If there is a nested set, use `setMode' to define how
				1033	// the set should be parsed. If the '[' is part of the
				1034	// opening delimiter for this pattern, parse special
				1035	// strings "[", "[^", "[-", and "[^-". Check for stand-in
				1036	// characters representing a nested set in the symbol
				1037	// table.
				1038
				1039	else {
				1040	// Prepare to backup if necessary
				1041	backup = chars.getPos(backup);
				1042	c = chars.next(opts);
				1043	literal = chars.isEscaped();
				1044
				1045	if (c == '[' && !literal) {
				1046	if (mode == 1) {
					// A '[' inside the brackets starts an inline nested set: rewind
					// so that the nested parse sees the '[' itself.
				1047	chars.setPos(backup); // backup
				1048	setMode = 1;
				1049	} else {
				1050	// Handle opening '[' delimiter
				1051	mode = 1;
				1052	pat.append('[');
				1053	backup = chars.getPos(backup); // prepare to backup
				1054	c = chars.next(opts);
				1055	literal = chars.isEscaped();
				1056	if (c == '^' && !literal) {
				1057	invert = true;
				1058	pat.append('^');
				1059	backup = chars.getPos(backup); // prepare to backup
				1060	c = chars.next(opts);
				1061	literal = chars.isEscaped();
				1062	}
				1063	// Fall through to handle special leading '-';
				1064	// otherwise restart loop for nested [], \p{}, etc.
				1065	if (c == '-') {
				1066	literal = true;
				1067	// Fall through to handle literal '-' below
				1068	} else {
				1069	chars.setPos(backup); // backup
				1070	continue;
				1071	}
				1072	}
				1073	} else if (symbols != null) {
				1074	UnicodeMatcher m = symbols.lookupMatcher(c); // may be null
				1075	if (m != null) {
					// A stand-in must resolve to a UnicodeSet; any other matcher type
					// makes the cast fail and is reported as a syntax error.
				1076	try {
				1077	nested = (UnicodeSet) m;
				1078	setMode = 3;
				1079	} catch (ClassCastException e) {
				1080	syntaxError(chars, "Syntax error");
				1081	}
				1082	}
				1083	}
				1084	}
				1085
				1086	// -------- Handle a nested set. This either is inline in
				1087	// the pattern or represented by a stand-in that has
				1088	// previously been parsed and was looked up in the symbol
				1089	// table.
				1090
				1091	if (setMode != 0) {
					// A pending single character is flushed into the set before the
					// nested set is handled; an operator between a character and a
					// set is rejected.
				1092	if (lastItem == 1) {
				1093	if (op != 0) {
				1094	syntaxError(chars, "Char expected after operator");
				1095	}
				1096	add(lastChar, lastChar);
				1097	_appendToPat(pat, lastChar, false);
				1098	lastItem = op = 0;
				1099	}
				1100
				1101	if (op == '-' \|\| op == '&') {
				1102	pat.append(op);
				1103	}
				1104
				1105	if (nested == null) {
				1106	if (scratch == null) scratch = new UnicodeSet();
				1107	nested = scratch;
				1108	}
					// Fill 'nested' (modes 1 and 2) and append its text to pat.
				1109	switch (setMode) {
				1110	case 1:
				1111	nested.applyPattern(chars, symbols, pat, options);
				1112	break;
				1113	case 2:
				1114	chars.skipIgnored(opts);
				1115	nested.applyPropertyPattern(chars, pat, symbols);
				1116	break;
				1117	case 3: // `nested' already parsed
				1118	nested._toPattern(pat, false);
				1119	break;
				1120	}
				1121
				1122	usePat = true;
				1123
				1124	if (mode == 0) {
				1125	// Entire pattern is a category; leave parse loop
				1126	set(nested);
				1127	mode = 2;
				1128	break;
				1129	}
				1130
					// Combine the nested set with what has been accumulated so far,
					// according to the pending operator (none means union).
				1131	switch (op) {
				1132	case '-':
				1133	removeAll(nested);
				1134	break;
				1135	case '&':
				1136	retainAll(nested);
				1137	break;
				1138	case 0:
				1139	addAll(nested);
				1140	break;
				1141	}
				1142
				1143	op = 0;
				1144	lastItem = 2;
				1145
				1146	continue;
				1147	}
				1148
					// Anything other than a set or property pattern is only legal
					// once the opening '[' has been seen.
				1149	if (mode == 0) {
				1150	syntaxError(chars, "Missing '['");
				1151	}
				1152
				1153	// -------- Parse special (syntax) characters. If the
				1154	// current character is not special, or if it is escaped,
				1155	// then fall through and handle it below.
				1156
					// NOTE: syntaxError() always throws, so the cases below that end
					// with a syntaxError() call never fall through into the next case.
				1157	if (!literal) {
				1158	switch (c) {
				1159	case ']':
				1160	if (lastItem == 1) {
				1161	add(lastChar, lastChar);
				1162	_appendToPat(pat, lastChar, false);
				1163	}
				1164	// Treat final trailing '-' as a literal
				1165	if (op == '-') {
				1166	add(op, op);
				1167	pat.append(op);
				1168	} else if (op == '&') {
				1169	syntaxError(chars, "Trailing '&'");
				1170	}
				1171	pat.append(']');
				1172	mode = 2;
				1173	continue;
				1174	case '-':
				1175	if (op == 0) {
				1176	if (lastItem != 0) {
					// '-' after a char or set: remember it as the pending operator.
				1177	op = (char) c;
				1178	continue;
				1179	} else {
				1180	// Treat final trailing '-' as a literal
				1181	add(c, c);
				1182	c = chars.next(opts);
				1183	literal = chars.isEscaped();
				1184	if (c == ']' && !literal) {
				1185	pat.append("-]");
				1186	mode = 2;
				1187	continue;
				1188	}
				1189	}
				1190	}
				1191	syntaxError(chars, "'-' not after char or set");
				1192	case '&':
					// '&' is accepted only directly after a set with no operator pending.
				1193	if (lastItem == 2 && op == 0) {
				1194	op = (char) c;
				1195	continue;
				1196	}
				1197	syntaxError(chars, "'&' not after set");
				1198	case '^':
				1199	syntaxError(chars, "'^' not after '['");
				1200	case '{':
					// Multi-character string element: collect everything up to the
					// next unescaped '}' into buf and add it as a string.
				1201	if (op != 0) {
				1202	syntaxError(chars, "Missing operand after operator");
				1203	}
				1204	if (lastItem == 1) {
				1205	add(lastChar, lastChar);
				1206	_appendToPat(pat, lastChar, false);
				1207	}
				1208	lastItem = 0;
				1209	if (buf == null) {
				1210	buf = new StringBuffer();
				1211	} else {
				1212	buf.setLength(0);
				1213	}
				1214	boolean ok = false;
				1215	while (!chars.atEnd()) {
				1216	c = chars.next(opts);
				1217	literal = chars.isEscaped();
				1218	if (c == '}' && !literal) {
				1219	ok = true;
				1220	break;
				1221	}
				1222	UTF16.append(buf, c);
				1223	}
					// Reject an empty string "{}" and a missing closing '}'.
				1224	if (buf.length() < 1 \|\| !ok) {
				1225	syntaxError(chars, "Invalid multicharacter string");
				1226	}
				1227	// We have new string. Add it to set and continue;
				1228	// we don't need to drop through to the further
				1229	// processing
				1230	add(buf.toString());
				1231	pat.append('{');
				1232	_appendToPat(pat, buf.toString(), false);
				1233	pat.append('}');
				1234	continue;
				1235	case SymbolTable.SYMBOL_REF:
				1236	// symbols nosymbols
				1237	// [a-$] error error (ambiguous)
				1238	// [a$] anchor anchor
				1239	// [a-$x] var "x"* literal '$'
				1240	// [a-$.] error literal '$'
				1241	// *We won't get here in the case of var "x"
				1242	backup = chars.getPos(backup);
				1243	c = chars.next(opts);
				1244	literal = chars.isEscaped();
				1245	boolean anchor = (c == ']' && !literal);
				1246	if (symbols == null && !anchor) {
					// Undo the look-ahead and treat the '$' as an ordinary character
					// in the literal handling after this switch.
				1247	c = SymbolTable.SYMBOL_REF;
				1248	chars.setPos(backup);
				1249	break; // literal '$'
				1250	}
				1251	if (anchor && op == 0) {
				1252	if (lastItem == 1) {
				1253	add(lastChar, lastChar);
				1254	_appendToPat(pat, lastChar, false);
				1255	}
				1256	add(UnicodeMatcher.ETHER);
				1257	usePat = true;
				1258	pat.append(SymbolTable.SYMBOL_REF).append(']');
				1259	mode = 2;
				1260	continue;
				1261	}
				1262	syntaxError(chars, "Unquoted '$'");
				1263	default:
				1264	break;
				1265	}
				1266	}
				1267
				1268	// -------- Parse literal characters. This includes both
				1269	// escaped chars ("\u4E01") and non-syntax characters
				1270	// ("a").
				1271
					// What happens to c depends on what preceded it: nothing, a single
					// character (possibly followed by '-'), or a set.
				1272	switch (lastItem) {
				1273	case 0:
				1274	lastItem = 1;
				1275	lastChar = c;
				1276	break;
				1277	case 1:
				1278	if (op == '-') {
				1279	if (lastChar >= c) {
				1280	// Don't allow redundant (a-a) or empty (b-a) ranges;
				1281	// these are most likely typos.
				1282	syntaxError(chars, "Invalid range");
				1283	}
				1284	add(lastChar, c);
				1285	_appendToPat(pat, lastChar, false);
				1286	pat.append(op);
				1287	_appendToPat(pat, c, false);
				1288	lastItem = op = 0;
				1289	} else {
					// Two characters in a row: flush the earlier one and keep the new
					// one pending, since it may still start a range.
				1290	add(lastChar, lastChar);
				1291	_appendToPat(pat, lastChar, false);
				1292	lastChar = c;
				1293	}
				1294	break;
				1295	case 2:
				1296	if (op != 0) {
				1297	syntaxError(chars, "Set expected after operator");
				1298	}
				1299	lastChar = c;
				1300	lastItem = 1;
				1301	break;
				1302	}
				1303	}
				1304
					// The loop may also end because the input ran out; that is an
					// error unless the closing ']' (or a whole-pattern set) was seen.
				1305	if (mode != 2) {
				1306	syntaxError(chars, "Missing ']'");
				1307	}
				1308
				1309	chars.skipIgnored(opts);
				1310
					// Apply the leading '^' now that the body has been parsed.
				1311	if (invert) {
				1312	complement();
				1313	}
				1314
				1315	// Use the rebuilt pattern (pat) only if necessary. Prefer the
				1316	// generated pattern.
				1317	if (usePat) {
				1318	rebuiltPat.append(pat.toString());
				1319	} else {
				1320	_generatePattern(rebuiltPat, false);
				1321	}
				1322	}
				1323
				1324	private static void syntaxError(RuleCharacterIterator chars, String msg) {
				1325	throw new IllegalArgumentException("Error: " + msg + " at \"" +
				1326	Utility.escape(chars.toString()) +
				1327	'"');
				1328	}
				1329
				1330	//----------------------------------------------------------------
				1331	// Implementation: Utility methods
				1332	//----------------------------------------------------------------
				1333
				1334	private void ensureCapacity(int newLen) {
				1335	if (newLen <= list.length) return;
				1336	int[] temp = new int[newLen + GROW_EXTRA];
				1337	System.arraycopy(list, 0, temp, 0, len);
				1338	list = temp;
				1339	}
				1340
				1341	private void ensureBufferCapacity(int newLen) {
				1342	if (buffer != null && newLen <= buffer.length) return;
				1343	buffer = new int[newLen + GROW_EXTRA];
				1344	}
				1345
				1346	/**
				1347	* Assumes start <= end.
				1348	*/
				1349	private int[] range(int start, int end) {
				1350	if (rangeList == null) {
				1351	rangeList = new int[] { start, end+1, HIGH };
				1352	} else {
				1353	rangeList[0] = start;
				1354	rangeList[1] = end+1;
				1355	}
				1356	return rangeList;
				1357	}
				1358
				1359	//----------------------------------------------------------------
				1360	// Implementation: Fundamental operations
				1361	//----------------------------------------------------------------
				1362
				1363	// polarity = 0, 3 is normal: x xor y
				1364	// polarity = 1, 2: x xor ~y == x === y
				1365
				1366	private UnicodeSet xor(int[] other, int otherLen, int polarity) {
				1367	ensureBufferCapacity(len + otherLen);
				1368	int i = 0, j = 0, k = 0;
				1369	int a = list[i++];
				1370	int b;
				1371	if (polarity == 1 \|\| polarity == 2) {
				1372	b = LOW;
				1373	if (other[j] == LOW) { // skip base if already LOW
				1374	++j;
				1375	b = other[j];
				1376	}
				1377	} else {
				1378	b = other[j++];
				1379	}
				1380	// simplest of all the routines
				1381	// sort the values, discarding identicals!
				1382	while (true) {
				1383	if (a < b) {
				1384	buffer[k++] = a;
				1385	a = list[i++];
				1386	} else if (b < a) {
				1387	buffer[k++] = b;
				1388	b = other[j++];
				1389	} else if (a != HIGH) { // at this point, a == b
				1390	// discard both values!
				1391	a = list[i++];
				1392	b = other[j++];
				1393	} else { // DONE!
				1394	buffer[k++] = HIGH;
				1395	len = k;
				1396	break;
				1397	}
				1398	}
				1399	// swap list and buffer
				1400	int[] temp = list;
				1401	list = buffer;
				1402	buffer = temp;
				1403	pat = null;
				1404	return this;
				1405	}
				1406
				1407	// polarity = 0 is normal: x union y
				1408	// polarity = 2: x union ~y
				1409	// polarity = 1: ~x union y
				1410	// polarity = 3: ~x union ~y
					//
					// Merges list[] and other[] into buffer[] as a union, then swaps
					// buffer[] and list[], updates len and clears the cached pat.
					// otherLen is only used to size buffer[]; the merge loop is left
					// only when a HIGH value is reached, so both arrays are expected
					// to end with HIGH.
					// Returns this.
				1411
				1412	private UnicodeSet add(int[] other, int otherLen, int polarity) {
				1413	ensureBufferCapacity(len + otherLen);
					// i, j, k index list[], other[] and buffer[] respectively;
					// a and b are the current values taken from list[] and other[].
				1414	int i = 0, j = 0, k = 0;
				1415	int a = list[i++];
				1416	int b = other[j++];
				1417	// change from xor is that we have to check overlapping pairs
				1418	// polarity bit 1 means a is second, bit 2 means b is.
					// Every value consumed from list[] toggles bit 1 of polarity and
					// every value consumed from other[] toggles bit 2, so the switch
					// below always dispatches on the current state of both inputs.
					// The loop is labelled so the cases can leave it with 'break main'.
				1419	main:
				1420	while (true) {
				1421	switch (polarity) {
				1422	case 0: // both first; take lower if unequal
				1423	if (a < b) { // take a
				1424	// Back up over overlapping ranges in buffer[]
				1425	if (k > 0 && a <= buffer[k-1]) {
				1426	// Pick latter end value in buffer[] vs. list[]
				1427	a = max(list[i], buffer[--k]);
				1428	} else {
				1429	// No overlap
				1430	buffer[k++] = a;
				1431	a = list[i];
				1432	}
				1433	i++; // Common if/else code factored out
				1434	polarity ^= 1;
				1435	} else if (b < a) { // take b
					// Mirror image of the 'take a' branch, applied to other[].
				1436	if (k > 0 && b <= buffer[k-1]) {
				1437	b = max(other[j], buffer[--k]);
				1438	} else {
				1439	buffer[k++] = b;
				1440	b = other[j];
				1441	}
				1442	j++;
				1443	polarity ^= 2;
				1444	} else { // a == b, take a, drop b
				1445	if (a == HIGH) break main;
				1446	// This is symmetrical; it doesn't matter if
				1447	// we backtrack with a or b. - liu
				1448	if (k > 0 && a <= buffer[k-1]) {
				1449	a = max(list[i], buffer[--k]);
				1450	} else {
				1451	// No overlap
				1452	buffer[k++] = a;
				1453	a = list[i];
				1454	}
				1455	i++;
				1456	polarity ^= 1;
				1457	b = other[j++]; polarity ^= 2;
				1458	}
				1459	break;
				1460	case 3: // both second; take higher if unequal, and drop other
				1461	if (b <= a) { // take a
				1462	if (a == HIGH) break main;
				1463	buffer[k++] = a;
				1464	} else { // take b
				1465	if (b == HIGH) break main;
				1466	buffer[k++] = b;
				1467	}
				1468	a = list[i++]; polarity ^= 1; // factored common code
				1469	b = other[j++]; polarity ^= 2;
				1470	break;
				1471	case 1: // a second, b first; if b < a, overlap
				1472	if (a < b) { // no overlap, take a
				1473	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1474	} else if (b < a) { // OVERLAP, drop b
				1475	b = other[j++]; polarity ^= 2;
				1476	} else { // a == b, drop both!
				1477	if (a == HIGH) break main;
				1478	a = list[i++]; polarity ^= 1;
				1479	b = other[j++]; polarity ^= 2;
				1480	}
				1481	break;
				1482	case 2: // a first, b second; if a < b, overlap
				1483	if (b < a) { // no overlap, take b
				1484	buffer[k++] = b; b = other[j++]; polarity ^= 2;
				1485	} else if (a < b) { // OVERLAP, drop a
				1486	a = list[i++]; polarity ^= 1;
				1487	} else { // a == b, drop both!
				1488	if (a == HIGH) break main;
				1489	a = list[i++]; polarity ^= 1;
				1490	b = other[j++]; polarity ^= 2;
				1491	}
				1492	break;
				1493	}
				1494	}
				1495	buffer[k++] = HIGH; // terminate
				1496	len = k;
				1497	// swap list and buffer
					// (the previous list[] is kept as the spare buffer for the next call)
				1498	int[] temp = list;
				1499	list = buffer;
				1500	buffer = temp;
					// The cached pattern no longer matches the contents.
				1501	pat = null;
				1502	return this;
				1503	}
				1504
				1505	// polarity = 0 is normal: x intersect y
				1506	// polarity = 2: x intersect ~y == set-minus
				1507	// polarity = 1: ~x intersect y
				1508	// polarity = 3: ~x intersect ~y
					//
					// Merges list[] and other[] into buffer[] as an intersection, then
					// swaps buffer[] and list[], updates len and clears the cached pat.
					// otherLen is only used to size buffer[]; the merge loop is left
					// only when both current values equal HIGH, so both arrays are
					// expected to end with HIGH.
					// Returns this.
				1509
				1510	private UnicodeSet retain(int[] other, int otherLen, int polarity) {
				1511	ensureBufferCapacity(len + otherLen);
					// i, j, k index list[], other[] and buffer[] respectively;
					// a and b are the current values taken from list[] and other[].
				1512	int i = 0, j = 0, k = 0;
				1513	int a = list[i++];
				1514	int b = other[j++];
				1515	// change from xor is that we have to check overlapping pairs
				1516	// polarity bit 1 means a is second, bit 2 means b is.
					// Every value consumed from list[] toggles bit 1 of polarity and
					// every value consumed from other[] toggles bit 2.
					// The loop is labelled so the cases can leave it with 'break main'.
				1517	main:
				1518	while (true) {
				1519	switch (polarity) {
				1520	case 0: // both first; drop the smaller
				1521	if (a < b) { // drop a
				1522	a = list[i++]; polarity ^= 1;
				1523	} else if (b < a) { // drop b
				1524	b = other[j++]; polarity ^= 2;
				1525	} else { // a == b, take one, drop other
				1526	if (a == HIGH) break main;
				1527	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1528	b = other[j++]; polarity ^= 2;
				1529	}
				1530	break;
				1531	case 3: // both second; take lower if unequal
				1532	if (a < b) { // take a
				1533	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1534	} else if (b < a) { // take b
				1535	buffer[k++] = b; b = other[j++]; polarity ^= 2;
				1536	} else { // a == b, take one, drop other
				1537	if (a == HIGH) break main;
				1538	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1539	b = other[j++]; polarity ^= 2;
				1540	}
				1541	break;
				1542	case 1: // a second, b first;
				1543	if (a < b) { // NO OVERLAP, drop a
				1544	a = list[i++]; polarity ^= 1;
				1545	} else if (b < a) { // OVERLAP, take b
				1546	buffer[k++] = b; b = other[j++]; polarity ^= 2;
				1547	} else { // a == b, drop both!
				1548	if (a == HIGH) break main;
				1549	a = list[i++]; polarity ^= 1;
				1550	b = other[j++]; polarity ^= 2;
				1551	}
				1552	break;
				1553	case 2: // a first, b second; if a < b, overlap
				1554	if (b < a) { // no overlap, drop b
				1555	b = other[j++]; polarity ^= 2;
				1556	} else if (a < b) { // OVERLAP, take a
				1557	buffer[k++] = a; a = list[i++]; polarity ^= 1;
				1558	} else { // a == b, drop both!
				1559	if (a == HIGH) break main;
				1560	a = list[i++]; polarity ^= 1;
				1561	b = other[j++]; polarity ^= 2;
				1562	}
				1563	break;
				1564	}
				1565	}
				1566	buffer[k++] = HIGH; // terminate
				1567	len = k;
				1568	// swap list and buffer
					// (the previous list[] is kept as the spare buffer for the next call)
				1569	int[] temp = list;
				1570	list = buffer;
				1571	buffer = temp;
					// The cached pattern no longer matches the contents.
				1572	pat = null;
				1573	return this;
				1574	}
				1575
				1576	private static final int max(int a, int b) {
				1577	return (a > b) ? a : b;
				1578	}
				1579
				1580	//----------------------------------------------------------------
				1581	// Generic filter-based scanning code
				1582	//----------------------------------------------------------------
				1583
				1584	private static interface Filter {
				1585	boolean contains(int codePoint);
				1586	}
				1587
				1588	// VersionInfo for unassigned characters
					// (version 0.0.0.0). VersionFilter.contains() compares the result of
					// UCharacter.getAge() against this instance by reference.
				1589	static final VersionInfo NO_VERSION = VersionInfo.getInstance(0, 0, 0, 0);
				1590
				1591	private static class VersionFilter implements Filter {
				1592	VersionInfo version;
				1593	VersionFilter(VersionInfo version) { this.version = version; }
				1594	public boolean contains(int ch) {
				1595	VersionInfo v = UCharacter.getAge(ch);
				1596	// Reference comparison ok; VersionInfo caches and reuses
				1597	// unique objects.
				1598	return v != NO_VERSION &&
				1599	v.compareTo(version) <= 0;
				1600	}
				1601	}
				1602
				1603	private static synchronized UnicodeSet getInclusions() {
				1604	if (INCLUSIONS == null) {
				1605	UCharacterProperty property = UCharacterProperty.getInstance();
				1606	INCLUSIONS = property.getInclusions();
				1607	}
				1608	return INCLUSIONS;
				1609	}
				1610
				1611	/**
				1612	* Generic filter-based scanning code for UCD property UnicodeSets.
				1613	*/
				1614	private UnicodeSet applyFilter(Filter filter) {
				1615	// Walk through all Unicode characters, noting the start
				1616	// and end of each range for which filter.contain(c) is
				1617	// true. Add each range to a set.
				1618	//
				1619	// To improve performance, use the INCLUSIONS set, which
				1620	// encodes information about character ranges that are known
				1621	// to have identical properties, such as the CJK Ideographs
				1622	// from U+4E00 to U+9FA5. INCLUSIONS contains all characters
				1623	// except the first characters of such ranges.
				1624	//
				1625	// TODO Where possible, instead of scanning over code points,
				1626	// use internal property data to initialize UnicodeSets for
				1627	// those properties. Scanning code points is slow.
				1628
				1629	clear();
				1630
				1631	int startHasProperty = -1;
				1632	UnicodeSet inclusions = getInclusions();
				1633	int limitRange = inclusions.getRangeCount();
				1634
				1635	for (int j=0; j<limitRange; ++j) {
				1636	// get current range
				1637	int start = inclusions.getRangeStart(j);
				1638	int end = inclusions.getRangeEnd(j);
				1639
				1640	// for all the code points in the range, process
				1641	for (int ch = start; ch <= end; ++ch) {
				1642	// only add to the unicodeset on inflection points --
				1643	// where the hasProperty value changes to false
				1644	if (filter.contains(ch)) {
				1645	if (startHasProperty < 0) {
				1646	startHasProperty = ch;
				1647	}
				1648	} else if (startHasProperty >= 0) {
				1649	add(startHasProperty, ch-1);
				1650	startHasProperty = -1;
				1651	}
				1652	}
				1653	}
				1654	if (startHasProperty >= 0) {
				1655	add(startHasProperty, 0x10FFFF);
				1656	}
				1657
				1658	return this;
				1659	}
				1660
				1661
				1662	/**
				1663	* Remove leading and trailing rule white space and compress
				1664	* internal rule white space to a single space character.
				1665	*
				1666	* @see UCharacterProperty#isRuleWhiteSpace
				1667	*/
				1668	private static String mungeCharName(String source) {
				1669	StringBuffer buf = new StringBuffer();
				1670	for (int i=0; i<source.length(); ) {
				1671	int ch = UTF16.charAt(source, i);
				1672	i += UTF16.getCharCount(ch);
				1673	if (UCharacterProperty.isRuleWhiteSpace(ch)) {
				1674	if (buf.length() == 0 \|\|
				1675	buf.charAt(buf.length() - 1) == ' ') {
				1676	continue;
				1677	}
				1678	ch = ' '; // convert to ' '
				1679	}
				1680	UTF16.append(buf, ch);
				1681	}
				1682	if (buf.length() != 0 &&
				1683	buf.charAt(buf.length() - 1) == ' ') {
				1684	buf.setLength(buf.length() - 1);
				1685	}
				1686	return buf.toString();
				1687	}
				1688
				1689	//----------------------------------------------------------------
				1690	// Property set API
				1691	//----------------------------------------------------------------
				1692
				1693	/**
				1694	* Modifies this set to contain those code points which have the
				1695	* given value for the given property. Prior contents of this
				1696	* set are lost.
				1697	* @param propertyAlias
				1698	* @param valueAlias
				1699	* @param symbols if not null, then symbols are first called to see if a property
				1700	* is available. If true, then everything else is skipped.
				1701	* @return this set
				1702	* @draft ICU 3.2
				1703	* @deprecated This is a draft API and might change in a future release of ICU.
				1704	*/
				1705	public UnicodeSet applyPropertyAlias(String propertyAlias,
				1706	String valueAlias, SymbolTable symbols) {
				1707	if (propertyAlias.equals("Age"))
				1708	{
				1709	// Must munge name, since
				1710	// VersionInfo.getInstance() does not do
				1711	// 'loose' matching.
				1712	VersionInfo version = VersionInfo.getInstance(mungeCharName(valueAlias));
				1713	applyFilter(new VersionFilter(version));
				1714	return this;
				1715	}
				1716	else
				1717	throw new IllegalArgumentException("Unsupported property");
				1718	}
				1719
				1720	/**
				1721	* Return true if the given iterator appears to point at a
				1722	* property pattern. Regardless of the result, return with the
				1723	* iterator unchanged.
				1724	* @param chars iterator over the pattern characters. Upon return
				1725	* it will be unchanged.
				1726	* @param iterOpts RuleCharacterIterator options
				1727	*/
				1728	private static boolean resemblesPropertyPattern(RuleCharacterIterator chars,
				1729	int iterOpts) {
				1730	boolean result = false;
				1731	iterOpts &= ~RuleCharacterIterator.PARSE_ESCAPES;
				1732	Object pos = chars.getPos(null);
				1733	int c = chars.next(iterOpts);
				1734	if (c == '[' \|\| c == '\\') {
				1735	int d = chars.next(iterOpts & ~RuleCharacterIterator.SKIP_WHITESPACE);
				1736	result = (c == '[') ? (d == ':') :
				1737	(d == 'N' \|\| d == 'p' \|\| d == 'P');
				1738	}
				1739	chars.setPos(pos);
				1740	return result;
				1741	}
				1742
				1743	/**
				1744	* Parse the given property pattern at the given parse position.
				1745	* @param symbols TODO
				1746	*/
				1747	private UnicodeSet applyPropertyPattern(String pattern, ParsePosition ppos, SymbolTable symbols) {
				1748	int pos = ppos.getIndex();
				1749
				1750	// On entry, ppos should point to one of the following locations:
				1751
				1752	// Minimum length is 5 characters, e.g. \p{L}
				1753	if ((pos+5) > pattern.length()) {
				1754	return null;
				1755	}
				1756
				1757	boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
				1758	boolean isName = false; // true for \N{pat}, o/w false
				1759	boolean invert = false;
				1760
				1761	// Look for an opening [:, [:^, \p, or \P
				1762	if (pattern.regionMatches(pos, "[:", 0, 2)) {
				1763	posix = true;
				1764	pos = Utility.skipWhitespace(pattern, pos+2);
				1765	if (pos < pattern.length() && pattern.charAt(pos) == '^') {
				1766	++pos;
				1767	invert = true;
				1768	}
				1769	} else if (pattern.regionMatches(true, pos, "\\p", 0, 2) \|\|
				1770	pattern.regionMatches(pos, "\\N", 0, 2)) {
				1771	char c = pattern.charAt(pos+1);
				1772	invert = (c == 'P');
				1773	isName = (c == 'N');
				1774	pos = Utility.skipWhitespace(pattern, pos+2);
				1775	if (pos == pattern.length() \|\| pattern.charAt(pos++) != '{') {
				1776	// Syntax error; "\p" or "\P" not followed by "{"
				1777	return null;
				1778	}
				1779	} else {
				1780	// Open delimiter not seen
				1781	return null;
				1782	}
				1783
				1784	// Look for the matching close delimiter, either :] or }
				1785	int close = pattern.indexOf(posix ? ":]" : "}", pos);
				1786	if (close < 0) {
				1787	// Syntax error; close delimiter missing
				1788	return null;
				1789	}
				1790
				1791	// Look for an '=' sign. If this is present, we will parse a
				1792	// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
				1793	// pattern.
				1794	int equals = pattern.indexOf('=', pos);
				1795	String propName, valueName;
				1796	if (equals >= 0 && equals < close && !isName) {
				1797	// Equals seen; parse medium/long pattern
				1798	propName = pattern.substring(pos, equals);
				1799	valueName = pattern.substring(equals+1, close);
				1800	}
				1801
				1802	else {
				1803	// Handle case where no '=' is seen, and \N{}
				1804	propName = pattern.substring(pos, close);
				1805	valueName = "";
				1806
				1807	// Handle \N{name}
				1808	if (isName) {
				1809	// This is a little inefficient since it means we have to
				1810	// parse "na" back to UProperty.NAME even though we already
				1811	// know it's UProperty.NAME. If we refactor the API to
				1812	// support args of (int, String) then we can remove
				1813	// "na" and make this a little more efficient.
				1814	valueName = propName;
				1815	propName = "na";
				1816	}
				1817	}
				1818
				1819	applyPropertyAlias(propName, valueName, symbols);
				1820
				1821	if (invert) {
				1822	complement();
				1823	}
				1824
				1825	// Move to the limit position after the close delimiter
				1826	ppos.setIndex(close + (posix ? 2 : 1));
				1827
				1828	return this;
				1829	}
				1830
				1831	/**
				1832	* Parse a property pattern.
				1833	* @param chars iterator over the pattern characters. Upon return
				1834	* it will be advanced to the first character after the parsed
				1835	* pattern, or the end of the iteration if all characters are
				1836	* parsed.
				1837	* @param rebuiltPat the pattern that was parsed, rebuilt or
				1838	* copied from the input pattern, as appropriate.
				1839	* @param symbols TODO
				1840	*/
				1841	private void applyPropertyPattern(RuleCharacterIterator chars,
				1842	StringBuffer rebuiltPat, SymbolTable symbols) {
				1843	String pat = chars.lookahead();
				1844	ParsePosition pos = new ParsePosition(0);
				1845	applyPropertyPattern(pat, pos, symbols);
				1846	if (pos.getIndex() == 0) {
				1847	syntaxError(chars, "Invalid property pattern");
				1848	}
				1849	chars.jumpahead(pos.getIndex());
				1850	rebuiltPat.append(pat.substring(0, pos.getIndex()));
				1851	}
				1852
				1853	//----------------------------------------------------------------
				1854	// Case folding API
				1855	//----------------------------------------------------------------
				1856
				1857	/**
				1858	* Option bit for the constructor and applyPattern() requesting that
				1859	* white space be ignored. When set, characters for which
				1860	* UCharacterProperty.isRuleWhiteSpace() returns true are ignored
				1861	* unless they are quoted or escaped. The value is the single bit 1,
				1862	* so it may be ORed together with other selectors.
				1863	* @internal
				1864	*/
				1865	public static final int IGNORE_SPACE = 1;
				1866
				1867	}