blob: 1a748ff173121c420b18d219faad05b7ae5b1445 [file] [log] [blame]
/*
* Portions Copyright 2005-2006 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Sun designates this
* particular file as subject to the "Classpath" exception as provided
* by Sun in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
* have any questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Locale;
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into higher-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, an array of char is
* used to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into a more meaningful form
* lies with <a href="UCharacter.html">UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
* @draft 2.1
*/
public final class UCharacterProperty implements Trie.DataManipulate
{
// public data members -----------------------------------------------
/**
* Main property trie. getProperty() uses it directly for supplementary
* code points; BMP lookups go through the cached arrays below.
*/
public CharTrie m_trie_;
/**
* Optimization
* CharTrie index array, cached by setIndexData()
*/
public char[] m_trieIndex_;
/**
* Optimization
* CharTrie data array, cached by setIndexData()
*/
public char[] m_trieData_;
/**
* Optimization
* CharTrie initial value, cached by setIndexData(). getProperty() uses it
* as the index into m_property_ when a lookup falls outside the trie
* arrays or the code point is out of range.
*/
public int m_trieInitialValue_;
/**
* Character property table, indexed by the value found in the trie
* (see getProperty())
*/
public int m_property_[];
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
// The EXC_*_ indicators below are bit positions: hasExceptionValue() tests
// bit (1 << indicator) of the first word of an exception entry, and
// getException() uses the indicator to locate the matching value.
/**
* Exception indicator for uppercase type
*/
public static final int EXC_UPPERCASE_ = 0;
/**
* Exception indicator for lowercase type
*/
public static final int EXC_LOWERCASE_ = 1;
/**
* Exception indicator for titlecase type
*/
public static final int EXC_TITLECASE_ = 2;
/**
* Unused exception indicator.
* NOTE(review): the previous comment called this the "digit type"
* indicator; presumably the slot once held digit values - confirm.
*/
public static final int EXC_UNUSED_ = 3;
/**
* Exception indicator for numeric type
*/
public static final int EXC_NUMERIC_VALUE_ = 4;
/**
* Exception indicator for denominator type
*/
public static final int EXC_DENOMINATOR_VALUE_ = 5;
/**
* Exception indicator for mirror type
*/
public static final int EXC_MIRROR_MAPPING_ = 6;
/**
* Exception indicator for special casing type
*/
public static final int EXC_SPECIAL_CASING_ = 7;
/**
* Exception indicator for case folding type
*/
public static final int EXC_CASE_FOLDING_ = 8;
/**
* EXC_COMBINING_CLASS_ is not found in ICU.
* Used to retrieve the combining class of the character in the exception
* value. For this indicator getException() returns the first word of the
* exception entry itself instead of one of the values that follow it.
*/
public static final int EXC_COMBINING_CLASS_ = 9;
/**
* Latin lowercase i (U+0069)
*/
public static final char LATIN_SMALL_LETTER_I_ = 0x69;
/**
* Character type mask (low 5 bits)
*/
public static final int TYPE_MASK = 0x1F;
/**
* Exception test mask (bit 5)
*/
public static final int EXCEPTION_MASK = 0x20;
// public methods ----------------------------------------------------
/**
 * Java "friends" implementation: copies the private index array, data
 * array and initial value exposed by the friend agent into this object's
 * public fields so that getProperty() can index them directly.
 * @param friendagent accessor for the private members of a CharTrie
 */
public void setIndexData(CharTrie.FriendAgent friendagent)
{
    final char[] index = friendagent.getPrivateIndex();
    this.m_trieIndex_ = index;

    final char[] data = friendagent.getPrivateData();
    this.m_trieData_ = data;

    this.m_trieInitialValue_ = friendagent.getPrivateInitialValue();
}
/**
 * Called by Trie to extract, from the data value stored for a lead
 * surrogate, the index array offset of the indexes for that lead
 * surrogate.
 * @param value data value for a surrogate from the trie, including the
 *        folding offset
 * @return data offset, or 0 if the fold indicator bit is not set (there is
 *         no data for the lead surrogate)
 */
public int getFoldingOffset(int value)
{
    final boolean hasFoldedData =
        (value & SUPPLEMENTARY_FOLD_INDICATOR_MASK_) != 0;
    return hasFoldedData ? (value & SUPPLEMENTARY_FOLD_OFFSET_MASK_) : 0;
}
/**
 * Gets the property value of a code point.
 * This is an optimized lookup that works on the cached trie arrays; it
 * differs slightly from CharTrie in that the index taken from
 * m_trieData_ is never negative.
 * @param ch code point whose property value is to be retrieved
 * @return property value of the code point; the property stored for the
 *         trie's initial value when ch is not a valid code point or the
 *         BMP lookup falls outside the trie arrays
 */
public int getProperty(int ch)
{
    final boolean belowLeadSurrogates =
        ch < UTF16.LEAD_SURROGATE_MIN_VALUE;
    final boolean bmpAboveLeadSurrogates =
        ch > UTF16.LEAD_SURROGATE_MAX_VALUE
        && ch < UTF16.SUPPLEMENTARY_MIN_VALUE;

    if (belowLeadSurrogates || bmpAboveLeadSurrogates) {
        // BMP code point that is not a lead surrogate: plain two-stage
        // lookup. Negative values of ch also land here and are answered
        // by the catch clause.
        try {
            final int block = m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
                              << Trie.INDEX_STAGE_2_SHIFT_;
            final int dataOffset = block + (ch & Trie.INDEX_STAGE_3_MASK_);
            return m_property_[m_trieData_[dataOffset]];
        } catch (ArrayIndexOutOfBoundsException e) {
            return m_property_[m_trieInitialValue_];
        }
    }

    if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        // lead surrogate code point: its index entries sit at a separate
        // offset in the index array
        final int leadSlot =
            Trie.LEAD_INDEX_OFFSET_ + (ch >> Trie.INDEX_STAGE_1_SHIFT_);
        final int leadBlock =
            m_trieIndex_[leadSlot] << Trie.INDEX_STAGE_2_SHIFT_;
        return m_property_[
            m_trieData_[leadBlock + (ch & Trie.INDEX_STAGE_3_MASK_)]];
    }

    if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
        // supplementary code point: resolved through the lead surrogate
        // plus the low (trail) bits of the code point
        final char trailBits = (char)(ch & Trie.SURROGATE_MASK_);
        return m_property_[m_trie_.getSurrogateValue(
                               UTF16.getLeadSurrogate(ch), trailBits)];
    }

    // not a valid code point: answer with the property of the trie's
    // initial value (it cannot be assumed to live at offset 0)
    return m_property_[m_trieInitialValue_];
}
/**
 * Extracts the signed numeric value embedded in the upper bits of a
 * property word.
 * @param prop the character property word
 * @return signed numeric value
 */
public static int getSignedValue(int prop)
{
    // arithmetic shift, so the sign of the embedded value is preserved
    final int signedValue = prop >> VALUE_SHIFT_;
    return signedValue;
}
/**
 * Extracts the exception index embedded in a property word.
 * @param prop character property word
 * @return exception index, i.e. the shifted value field masked to an
 *         unsigned number
 */
public static int getExceptionIndex(int prop)
{
    final int valueField = prop >> VALUE_SHIFT_;
    return valueField & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
}
/**
 * Determines whether the exception entry at the index carries the kind of
 * information named by the indicator, e.g. whether it contains the
 * numeric value of the character.
 * @param index exception index
 * @param indicator type indicator (one of the EXC_*_ bit positions)
 * @return true if a value of that type exists
 */
public boolean hasExceptionValue(int index, int indicator)
{
    final int flags = m_exception_[index];
    final int wanted = 1 << indicator;
    return (flags & wanted) != 0;
}
/**
 * Gets the exception value of the given type at the index, assuming that
 * the data type is available. The result is undefined if the data is not
 * available; use hasExceptionValue() to check first.
 * @param index exception index
 * @param etype exception data type
 * @return exception value of the given type at index
 */
public int getException(int index, int etype)
{
    // first word of the entry: the flags describing which values follow
    final int flags = m_exception_[index];
    if (etype == EXC_COMBINING_CLASS_) {
        // this type is contained in the first word itself
        return flags;
    }
    // the values start right after the flags word; skip the ones that
    // precede the requested type
    final int valueAddress = addExceptionOffset(flags, etype, index + 1);
    return m_exception_[valueAddress];
}
/**
 * Appends the folded case characters stored at the index to a buffer.
 * @param index index of the case value to be retrieved
 * @param count number of characters to retrieve; nothing is appended when
 *        it is not positive
 * @param str string buffer to which the result is appended
 */
public void getFoldCase(int index, int count, StringBuffer str)
{
    // the first 2 chars of the entry hold the simple mappings
    for (int offset = index + 2, remaining = count;
         remaining > 0;
         offset ++, remaining --) {
        str.append(m_case_[offset]);
    }
}
/**
 * Gets the additional Unicode properties word of a code point.
 * C version: getUnicodeProperties.
 * @param codepoint code point whose additional properties are to be
 *        retrieved
 * @return additional properties word
 */
public int getAdditional(int codepoint) {
    final int[] vectors = m_additionalVectors_;
    final int vectorIndex = m_additionalTrie_.getCodePointValue(codepoint);
    return vectors[vectorIndex];
}
/**
 * <p>Gets the "age" of the code point.</p>
 * <p>The "age" is the Unicode version when the code point was first
 * designated (as a non-character or for Private Use) or assigned a
 * character.</p>
 * <p>This can be useful to avoid emitting code points to receiving
 * processes that do not accept newer characters.</p>
 * <p>The data is from the UCD file DerivedAge.txt.</p>
 * <p>This API does not check the validity of the codepoint.</p>
 * @param codepoint The code point.
 * @return the Unicode version number; only the first two fields are
 *         filled in, the remaining two are 0
 * @draft ICU 2.1
 */
public VersionInfo getAge(int codepoint)
{
    // the age is packed as two nibbles at the top of the additional word
    final int packedAge = getAdditional(codepoint) >> AGE_SHIFT_;
    final int firstField =
        (packedAge >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_;
    final int secondField = packedAge & LAST_NIBBLE_MASK_;
    return VersionInfo.getInstance(firstField, secondField, 0, 0);
}
/**
 * Forms a supplementary code point from a surrogate pair.<br>
 * This is for internal use, hence the validity of the surrogate
 * characters is not checked.
 * @param lead lead surrogate character
 * @param trail trailing surrogate character
 * @return code point of the supplementary character
 */
public static int getRawSupplementary(char lead, char trail)
{
    final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
    return shiftedLead + trail + SURROGATE_OFFSET_;
}
/**
 * Returns the shared UCharacterProperty instance, loading the property
 * data and creating the instance on first use.
 * <p>The lazy initialization is not synchronized.</p>
 * @return the shared instance
 * @throws RuntimeException when data is missing or data has been
 *         corrupted; the underlying exception is attached as the cause
 */
public static UCharacterProperty getInstance() throws RuntimeException
{
    if (INSTANCE_ == null) {
        try {
            INSTANCE_ = new UCharacterProperty();
        }
        catch (Exception e) {
            // chain the original exception so that the stack trace of the
            // real failure is not lost; the message is unchanged
            throw new RuntimeException(e.getMessage(), e);
        }
    }
    return INSTANCE_;
}
/**
 * Checks if the argument c is to be treated as a white space in ICU
 * rules. Usually ICU rule white spaces are ignored unless quoted.
 * <p>"White space" here is in the sense of ICU rule parsers: a FIXED LIST
 * that is NOT DEPENDENT ON UNICODE PROPERTIES, see UTR #31
 * (http://www.unicode.org/reports/tr31/):
 * U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029.</p>
 * @param c codepoint to check
 * @return true if c is an ICU rule white space
 */
public static boolean isRuleWhiteSpace(int c)
{
    // everything outside U+0009..U+2029 is rejected up front
    if (c < 0x0009 || c > 0x2029) {
        return false;
    }
    // U+0009..U+000D
    if (c <= 0x000D) {
        return true;
    }
    switch (c) {
    case 0x0020:
    case 0x0085:
    case 0x200E:
    case 0x200F:
        return true;
    default:
        // U+2028..U+2029 (the upper bound was checked above)
        return c >= 0x2028;
    }
}
// protected variables -----------------------------------------------
// (declared without a modifier, so they are actually package-private)
/**
* Case table. getFoldCase() reads the folded characters from it, skipping
* the first 2 chars of an entry (the simple mappings).
*/
char m_case_[];
/**
* Exception property table. The first word of an entry holds the flags
* tested by hasExceptionValue(); getException() reads the values that
* follow.
*/
int m_exception_[];
/**
* Extra property trie; getAdditional() uses its value as the index into
* m_additionalVectors_
*/
CharTrie m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns
*/
int m_additionalColumnsCount_;
/**
* Maximum values for block, bits used as in vector word
* 0
*/
int m_maxBlockScriptValue_;
/**
* Maximum values for script, bits used as in vector word
* 0
* NOTE(review): this text duplicates the comment of
* m_maxBlockScriptValue_; the field name suggests other maxima ("JTG")
* - confirm against the code that fills it in.
*/
int m_maxJTGValue_;
// private variables -------------------------------------------------
/**
* Singleton instance, created lazily by getInstance()
*/
private static UCharacterProperty INSTANCE_ = null;
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
* Default buffer size of datafile, used for the BufferedInputStream that
* the constructor wraps around the data stream
*/
private static final int DATA_BUFFER_SIZE_ = 25000;
/**
* Number of indicator bits handled per lookup in FLAGS_OFFSET_:
* the number of bits in an 8-bit integer value.
* addExceptionOffset() treats indicators at or above this value by first
* counting all bits of the low group and then shifting it out.
*/
private static final int EXC_GROUP_ = 8;
/**
* Mask to get the group (the low 8 bits)
*/
private static final int EXC_GROUP_MASK_ = 255;
/**
* Mask to get the digit value in the exception result.
* Not referenced in the visible part of this file.
*/
private static final int EXC_DIGIT_MASK_ = 0xFFFF;
/**
* Offset table for data in exception block.<br>
* Entry i is the number of bits that are set in the 8-bit value i,
* e.g. 0 = 0 bits, 1 = 1 bit, 3 = 2 bits, 255 = 8 bits.
* addExceptionOffset() uses it to count how many values precede the
* requested one.
*/
private static final byte FLAGS_OFFSET_[] =
{
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
/**
* Numeric value shift: getSignedValue() and getExceptionIndex() shift a
* property word right by this many bits to reach the value field
*/
private static final int VALUE_SHIFT_ = 20;
/**
* Mask to be applied after shifting to obtain an unsigned numeric value
*/
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0x7FF;
/**
* Presumably the shift for the numeric type field of a property word
* (inferred from the name only - confirm).
* Not referenced in the visible part of this file.
*/
private static final int NUMERIC_TYPE_SHIFT = 12;
/**
* Folding indicator mask
*/
private static final int SUPPLEMENTARY_FOLD_INDICATOR_MASK_ = 0x8000;
/**
* Folding offset mask
*/
private static final int SUPPLEMENTARY_FOLD_OFFSET_MASK_ = 0x7FFF;
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Offset to add to combined surrogate pair to avoid masking.
* getRawSupplementary() adds it to (lead << LEAD_SURROGATE_SHIFT_) + trail.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
(UTF16.SURROGATE_MIN_VALUE <<
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
/**
* To get the last character out from a data type.
* Not referenced in the visible part of this file.
*/
private static final int LAST_CHAR_MASK_ = 0xFFFF;
/**
* First nibble shift
*/
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
/**
* Second nibble mask
*/
private static final int LAST_NIBBLE_MASK_ = 0xF;
/**
* Age value shift: getAge() shifts the additional properties word right
* by this many bits to reach the packed age
*/
private static final int AGE_SHIFT_ = 24;
// private constructors --------------------------------------------------
/**
 * Constructor. Reads the property data file into this object and then
 * hands this object to the main trie through putIndexData().
 * @throws IOException when data reading fails or the data is corrupted
 */
private UCharacterProperty() throws IOException
{
    // jar access
    InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
    BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
    try {
        UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
        reader.read(this);
    }
    finally {
        // close the stream even when reading fails, so that a corrupted
        // data file does not leak the underlying stream
        b.close();
    }
    m_trie_.putIndexData(this);
}
/* Is followed by {case-ignorable}* cased ? */
/**
 * Computes the address of a value inside an exception entry by skipping
 * one slot for every flag bit that is set below the requested indicator.
 * @param evalue exception value (the flags word of the entry)
 * @param indicator type of data to retrieve
 * @param address current address to move from
 * @return the correct address
 */
private int addExceptionOffset(int evalue, int indicator, int address)
{
    int flags = evalue;
    int bit = indicator;
    int offset = address;
    if (bit >= EXC_GROUP_) {
        // the indicator lies beyond the first group: every value flagged
        // in the low group precedes it, so count them all and move on to
        // the next group
        offset += FLAGS_OFFSET_[flags & EXC_GROUP_MASK_];
        flags >>= EXC_GROUP_;
        bit -= EXC_GROUP_;
    }
    // count only the flags strictly below the requested bit
    final int lowerBits = (1 << bit) - 1;
    return offset + FLAGS_OFFSET_[flags & lowerBits];
}
// Code points with hardcoded properties; addPropertyStarts() adds them
// (or the code point following them) as range boundaries.
// LF and FF are not referenced in the visible part of this file.
private static final int TAB = 0x0009; // U+0009 CHARACTER TABULATION
private static final int LF = 0x000a; // U+000A LINE FEED
private static final int FF = 0x000c; // U+000C FORM FEED
private static final int CR = 0x000d; // U+000D CARRIAGE RETURN
private static final int U_A = 0x0041; // 'A'
private static final int U_Z = 0x005a; // 'Z'
private static final int U_a = 0x0061; // 'a'
private static final int U_z = 0x007a; // 'z'
private static final int DEL = 0x007f; // U+007F DELETE
private static final int NL = 0x0085; // U+0085 NEXT LINE
private static final int NBSP = 0x00a0; // U+00A0 NO-BREAK SPACE
private static final int CGJ = 0x034f; // U+034F COMBINING GRAPHEME JOINER
private static final int FIGURESP= 0x2007; // U+2007 FIGURE SPACE
private static final int HAIRSP = 0x200a; // U+200A HAIR SPACE
private static final int ZWNJ = 0x200c; // U+200C ZERO WIDTH NON-JOINER
private static final int ZWJ = 0x200d; // U+200D ZERO WIDTH JOINER
private static final int RLM = 0x200f; // U+200F RIGHT-TO-LEFT MARK
private static final int NNBSP = 0x202f; // U+202F NARROW NO-BREAK SPACE
private static final int WJ = 0x2060; // U+2060 WORD JOINER
private static final int INHSWAP = 0x206a; // U+206A INHIBIT SYMMETRIC SWAPPING
private static final int NOMDIG = 0x206f; // U+206F NOMINAL DIGIT SHAPES
private static final int ZWNBSP = 0xfeff; // U+FEFF ZERO WIDTH NO-BREAK SPACE
/**
 * Adds to the set every code point at which a property value handled by
 * this class may change: the range starts of both tries, plus the
 * boundaries of the ranges whose properties are hardcoded.
 * @param set set that receives the start code points
 * @return the same set, for chaining
 */
public UnicodeSet addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of each trie */
    //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set);
    addTrieRangeStarts(m_trie_, set);
    //utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, set);
    addTrieRangeStarts(m_additionalTrie_, set);

    /* add code points with hardcoded properties, plus the ones following them */
    final int[] hardcodedStarts = {
        /* add for IS_THAT_CONTROL_SPACE() */
        TAB, /* range TAB..CR */
        CR + 1,
        0x1c,
        0x1f + 1,
        NL,
        NL + 1,
        /* add for u_isIDIgnorable() what was not added above */
        DEL, /* range DEL..NBSP-1, NBSP added below */
        HAIRSP,
        RLM + 1,
        INHSWAP,
        NOMDIG + 1,
        ZWNBSP,
        ZWNBSP + 1,
        /* add no-break spaces for u_isWhitespace() what was not added above */
        NBSP,
        NBSP + 1,
        FIGURESP,
        FIGURESP + 1,
        NNBSP,
        NNBSP + 1,
        /* add for u_charDigitValue() */
        0x3007, 0x3008,
        0x4e00, 0x4e01,
        0x4e8c, 0x4e8d,
        0x4e09, 0x4e0a,
        0x56db, 0x56dc,
        0x4e94, 0x4e95,
        0x516d, 0x516e,
        0x4e03, 0x4e04,
        0x516b, 0x516c,
        0x4e5d, 0x4e5e,
        /* add for u_digit() */
        U_a,
        U_z + 1,
        U_A,
        U_Z + 1,
        /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */
        WJ, /* range WJ..NOMDIG */
        0xfff0,
        0xfffb + 1,
        0xe0000,
        0xe0fff + 1,
        /* add for UCHAR_GRAPHEME_BASE and others */
        CGJ,
        CGJ + 1,
        /* add for UCHAR_JOINING_TYPE */
        ZWNJ, /* range ZWNJ..ZWJ */
        ZWJ + 1
    };
    for (int i = 0; i < hardcodedStarts.length; ++i) {
        set.add(hardcodedStarts[i]);
    }

    /* add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE */
    set.add(0x1100);
    addHangulTypeChanges(set, 0x115a, 0x115f,
                         UCharacter.HangulSyllableType.LEADING_JAMO);
    set.add(0x1160);
    addHangulTypeChanges(set, 0x11a3, 0x11a7,
                         UCharacter.HangulSyllableType.VOWEL_JAMO);
    set.add(0x11a8);
    addHangulTypeChanges(set, 0x11fa, 0x11ff,
                         UCharacter.HangulSyllableType.TRAILING_JAMO);

    /*
     * Omit code points for u_charCellWidth() because
     * - it is deprecated and not a real Unicode property
     * - they are probably already set from the trie enumeration
     */
    /*
     * Omit code points with hardcoded specialcasing properties
     * because we do not build property UnicodeSets for them right now.
     */
    return set; // for chaining
}

/* Adds the start code point of each same-value range of the trie. */
private static void addTrieRangeStarts(CharTrie trie, UnicodeSet set) {
    TrieIterator ranges = new TrieIterator(trie);
    RangeValueIterator.Element range = new RangeValueIterator.Element();
    while (ranges.next(range)) {
        set.add(range.start);
    }
}

/*
 * Walks first..last and adds every code point whose Hangul syllable type
 * differs from the type of the code point before it, starting out from
 * the given type.
 */
private static void addHangulTypeChanges(UnicodeSet set, int first,
                                         int last, int startType) {
    int currentType = startType;
    for (int cp = first; cp <= last; ++cp) {
        int type = UCharacter.getIntPropertyValue(
                       cp, UProperty.HANGUL_SYLLABLE_TYPE);
        if (type != currentType) {
            currentType = type;
            set.add(cp);
        }
    }
}
/*----------------------------------------------------------------
* Inclusions list
*----------------------------------------------------------------*/
/*
* Return a set of characters for property enumeration.
* The set implicitly contains 0x110000 as well, which is one more than the highest
* Unicode code point.
*
* This set is used as an ordered list - its code points are ordered, and
* consecutive code points (in Unicode code point order) in the set define a range.
* For each two consecutive characters (start, limit) in the set,
* all of the UCD/normalization and related properties for
* all code points start..limit-1 are all the same,
* except for character names and ISO comments.
*
* All Unicode code points U+0000..U+10ffff are covered by these ranges.
* The ranges define a partition of the Unicode code space.
* ICU uses the inclusions set to enumerate properties for generating
* UnicodeSets containing all code points that have a certain property value.
*
* The Inclusion List is generated from the UCD. It is generated
* by enumerating the data tries, and code points for hardcoded properties
* are added as well.
*
* --------------------------------------------------------------------------
*
* The following are ideas for getting properties-unique code point ranges,
* with possible optimizations beyond the current implementation.
* These optimizations would require more code and be more fragile.
* The current implementation generates one single list (set) for all properties.
*
* To enumerate properties efficiently, one needs to know ranges of
* repetitive values, so that the value of only each start code point
* can be applied to the whole range.
* This information is in principle available in the uprops.icu/unorm.icu data.
*
* There are two obstacles:
*
* 1. Some properties are computed from multiple data structures,
* making it necessary to get repetitive ranges by intersecting
* ranges from multiple tries.
*
* 2. It is not economical to write code for getting repetitive ranges
* that are precise for each of some 50 properties.
*
* Compromise ideas:
*
* - Get ranges per trie, not per individual property.
* Each range contains the same values for a whole group of properties.
* This would generate currently five range sets, two for uprops.icu tries
* and three for unorm.icu tries.
*
* - Combine sets of ranges for multiple tries to get sufficient sets
* for properties, e.g., the uprops.icu main and auxiliary tries
* for all non-normalization properties.
*
* Ideas for representing ranges and combining them:
*
* - A UnicodeSet could hold just the start code points of ranges.
* Multiple sets are easily combined by or-ing them together.
*
* - Alternatively, a UnicodeSet could hold each even-numbered range.
* All ranges could be enumerated by using each start code point
* (for the even-numbered ranges) as well as each limit (end+1) code point
* (for the odd-numbered ranges).
* It should be possible to combine two such sets by xor-ing them,
* but no more than two.
*
* The second way to represent ranges may(?!) yield smaller UnicodeSet arrays,
* but the first one is certainly simpler and applicable for combining more than
* two range sets.
*
* It is possible to combine all range sets for all uprops/unorm tries into one
* set that can be used for all properties.
* As an optimization, there could be less-combined range sets for certain
* groups of properties.
* The relationship of which less-combined range set to use for which property
* depends on the implementation of the properties and must be hardcoded
* - somewhat error-prone and higher maintenance but can be tested easily
* by building property sets "the simple way" in test code.
*
* ---
*
* Do not use a UnicodeSet pattern because that causes infinite recursion;
* UnicodeSet depends on the inclusions set.
*/
/**
 * Builds the inclusions set: the property start code points contributed
 * by NormalizerImpl followed by those contributed by this class.
 * @return a new set holding the start code points
 */
public UnicodeSet getInclusions() {
    final UnicodeSet inclusions = new UnicodeSet();
    NormalizerImpl.addPropertyStarts(inclusions);
    // addPropertyStarts() hands back the set it was given
    return addPropertyStarts(inclusions);
}
}