Blame - jdk/src/share/classes/java/lang/ConditionalSpecialCasing.java - platform/libcore

blob: d81e550eef5ffcb00a5b0ad30fc6c97cf86e0f92 [file] [log] [blame]

J. Duke	319a3b9	2007-12-01 00:00:00 +0000	[diff] [blame^]	1	/*
				2	* Copyright 2003-2005 Sun Microsystems, Inc. All Rights Reserved.
				3	* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
				4	*
				5	* This code is free software; you can redistribute it and/or modify it
				6	* under the terms of the GNU General Public License version 2 only, as
				7	* published by the Free Software Foundation. Sun designates this
				8	* particular file as subject to the "Classpath" exception as provided
				9	* by Sun in the LICENSE file that accompanied this code.
				10	*
				11	* This code is distributed in the hope that it will be useful, but WITHOUT
				12	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				13	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
				14	* version 2 for more details (a copy is included in the LICENSE file that
				15	* accompanied this code).
				16	*
				17	* You should have received a copy of the GNU General Public License version
				18	* 2 along with this work; if not, write to the Free Software Foundation,
				19	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
				20	*
				21	* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
				22	* CA 95054 USA or visit www.sun.com if you need additional information or
				23	* have any questions.
				24	*/
				25
				26	package java.lang;
				27
				28	import java.text.BreakIterator;
				29	import java.util.HashSet;
				30	import java.util.Hashtable;
				31	import java.util.Iterator;
				32	import java.util.Locale;
				33	import sun.text.Normalizer;
				34
				35
				36	/**
				37	* This is a utility class for <code>String.toLowerCase()</code> and
				38	* <code>String.toUpperCase()</code>, that handles special casing with
				39	* conditions. In other words, it handles the mappings with conditions
				40	* that are defined in
				41	* <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
				42	* Casing Properties</a> file.
				43	* <p>
				44	* Note that the unconditional case mappings (including 1:M mappings)
				45	* are handled in <code>Character.toLower/UpperCase()</code>.
				46	*/
				47	final class ConditionalSpecialCasing {
				48
				49	// context conditions.
				50	final static int FINAL_CASED = 1;
				51	final static int AFTER_SOFT_DOTTED = 2;
				52	final static int MORE_ABOVE = 3;
				53	final static int AFTER_I = 4;
				54	final static int NOT_BEFORE_DOT = 5;
				55
				56	// combining class definitions
				57	final static int COMBINING_CLASS_ABOVE = 230;
				58
				59	// Special case mapping entries
				60	static Entry[] entry = {
				61	//# ================================================================================
				62	//# Conditional mappings
				63	//# ================================================================================
				64	new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
				65
				66	//# ================================================================================
				67	//# Locale-sensitive mappings
				68	//# ================================================================================
				69	//# Lithuanian
				70	new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
				71	new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
				72	new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
				73	new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
				74	new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
				75	new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
				76	new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
				77
				78	//# ================================================================================
				79	//# Turkish and Azeri
				80	// new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
				81	// new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
				82	new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
				83	new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
				84	new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
				85	new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
				86	new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
				87	new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
				88	};
				89
				90	// A hash table that contains the above entries
				91	static Hashtable entryTable = new Hashtable();
				92	static {
				93	// create hashtable from the entry
				94	for (int i = 0; i < entry.length; i ++) {
				95	Entry cur = entry[i];
				96	Integer cp = new Integer(cur.getCodePoint());
				97	HashSet set = (HashSet)entryTable.get(cp);
				98	if (set == null) {
				99	set = new HashSet();
				100	}
				101	set.add(cur);
				102	entryTable.put(cp, set);
				103	}
				104	}
				105
				106	static int toLowerCaseEx(String src, int index, Locale locale) {
				107	char[] result = lookUpTable(src, index, locale, true);
				108
				109	if (result != null) {
				110	if (result.length == 1) {
				111	return result[0];
				112	} else {
				113	return Character.ERROR;
				114	}
				115	} else {
				116	// default to Character class' one
				117	return Character.toLowerCase(src.codePointAt(index));
				118	}
				119	}
				120
				121	static int toUpperCaseEx(String src, int index, Locale locale) {
				122	char[] result = lookUpTable(src, index, locale, false);
				123
				124	if (result != null) {
				125	if (result.length == 1) {
				126	return result[0];
				127	} else {
				128	return Character.ERROR;
				129	}
				130	} else {
				131	// default to Character class' one
				132	return Character.toUpperCaseEx(src.codePointAt(index));
				133	}
				134	}
				135
				136	static char[] toLowerCaseCharArray(String src, int index, Locale locale) {
				137	return lookUpTable(src, index, locale, true);
				138	}
				139
				140	static char[] toUpperCaseCharArray(String src, int index, Locale locale) {
				141	char[] result = lookUpTable(src, index, locale, false);
				142	if (result != null) {
				143	return result;
				144	} else {
				145	return Character.toUpperCaseCharArray(src.codePointAt(index));
				146	}
				147	}
				148
				149	private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
				150	HashSet set = (HashSet)entryTable.get(new Integer(src.codePointAt(index)));
				151
				152	if (set != null) {
				153	Iterator iter = set.iterator();
				154	String currentLang = locale.getLanguage();
				155	while (iter.hasNext()) {
				156	Entry entry = (Entry)iter.next();
				157	String conditionLang= entry.getLanguage();
				158	if (((conditionLang == null) \|\| (conditionLang.equals(currentLang))) &&
				159	isConditionMet(src, index, locale, entry.getCondition())) {
				160	return (bLowerCasing ? entry.getLowerCase() : entry.getUpperCase());
				161	}
				162	}
				163	}
				164
				165	return null;
				166	}
				167
				168	private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
				169	switch (condition) {
				170	case FINAL_CASED:
				171	return isFinalCased(src, index, locale);
				172
				173	case AFTER_SOFT_DOTTED:
				174	return isAfterSoftDotted(src, index);
				175
				176	case MORE_ABOVE:
				177	return isMoreAbove(src, index);
				178
				179	case AFTER_I:
				180	return isAfterI(src, index);
				181
				182	case NOT_BEFORE_DOT:
				183	return !isBeforeDot(src, index);
				184
				185	default:
				186	return true;
				187	}
				188	}
				189
				190	/**
				191	* Implements the "Final_Cased" condition
				192	*
				193	* Specification: Within the closest word boundaries containing C, there is a cased
				194	* letter before C, and there is no cased letter after C.
				195	*
				196	* Regular Expression:
				197	* Before C: [{cased==true}][{wordBoundary!=true}]*
				198	* After C: !([{wordBoundary!=true}]*[{cased}])
				199	*/
				200	private static boolean isFinalCased(String src, int index, Locale locale) {
				201	BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
				202	wordBoundary.setText(src);
				203	int ch;
				204
				205	// Look for a preceding 'cased' letter
				206	for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
				207	i -= Character.charCount(ch)) {
				208
				209	ch = src.codePointBefore(i);
				210	if (isCased(ch)) {
				211
				212	int len = src.length();
				213	// Check that there is no 'cased' letter after the index
				214	for (i = index + Character.charCount(src.codePointAt(index));
				215	(i < len) && !wordBoundary.isBoundary(i);
				216	i += Character.charCount(ch)) {
				217
				218	ch = src.codePointAt(i);
				219	if (isCased(ch)) {
				220	return false;
				221	}
				222	}
				223
				224	return true;
				225	}
				226	}
				227
				228	return false;
				229	}
				230
				231	/**
				232	* Implements the "After_I" condition
				233	*
				234	* Specification: The last preceding base character was an uppercase I,
				235	* and there is no intervening combining character class 230 (ABOVE).
				236	*
				237	* Regular Expression:
				238	* Before C: [I]([{cc!=230}&{cc!=0}])*
				239	*/
				240	private static boolean isAfterI(String src, int index) {
				241	int ch;
				242	int cc;
				243
				244	// Look for the last preceding base character
				245	for (int i = index; i > 0; i -= Character.charCount(ch)) {
				246
				247	ch = src.codePointBefore(i);
				248
				249	if (ch == 'I') {
				250	return true;
				251	} else {
				252	cc = Normalizer.getCombiningClass(ch);
				253	if ((cc == 0) \|\| (cc == COMBINING_CLASS_ABOVE)) {
				254	return false;
				255	}
				256	}
				257	}
				258
				259	return false;
				260	}
				261
				262	/**
				263	* Implements the "After_Soft_Dotted" condition
				264	*
				265	* Specification: The last preceding character with combining class
				266	* of zero before C was Soft_Dotted, and there is no intervening
				267	* combining character class 230 (ABOVE).
				268	*
				269	* Regular Expression:
				270	* Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
				271	*/
				272	private static boolean isAfterSoftDotted(String src, int index) {
				273	int ch;
				274	int cc;
				275
				276	// Look for the last preceding character
				277	for (int i = index; i > 0; i -= Character.charCount(ch)) {
				278
				279	ch = src.codePointBefore(i);
				280
				281	if (isSoftDotted(ch)) {
				282	return true;
				283	} else {
				284	cc = Normalizer.getCombiningClass(ch);
				285	if ((cc == 0) \|\| (cc == COMBINING_CLASS_ABOVE)) {
				286	return false;
				287	}
				288	}
				289	}
				290
				291	return false;
				292	}
				293
				294	/**
				295	* Implements the "More_Above" condition
				296	*
				297	* Specification: C is followed by one or more characters of combining
				298	* class 230 (ABOVE) in the combining character sequence.
				299	*
				300	* Regular Expression:
				301	* After C: [{cc!=0}]*[{cc==230}]
				302	*/
				303	private static boolean isMoreAbove(String src, int index) {
				304	int ch;
				305	int cc;
				306	int len = src.length();
				307
				308	// Look for a following ABOVE combining class character
				309	for (int i = index + Character.charCount(src.codePointAt(index));
				310	i < len; i += Character.charCount(ch)) {
				311
				312	ch = src.codePointAt(i);
				313	cc = Normalizer.getCombiningClass(ch);
				314
				315	if (cc == COMBINING_CLASS_ABOVE) {
				316	return true;
				317	} else if (cc == 0) {
				318	return false;
				319	}
				320	}
				321
				322	return false;
				323	}
				324
				325	/**
				326	* Implements the "Before_Dot" condition
				327	*
				328	* Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
				329	* Any sequence of characters with a combining class that is
				330	* neither 0 nor 230 may intervene between the current character
				331	* and the combining dot above.
				332	*
				333	* Regular Expression:
				334	* After C: ([{cc!=230}&{cc!=0}])*[\u0307]
				335	*/
				336	private static boolean isBeforeDot(String src, int index) {
				337	int ch;
				338	int cc;
				339	int len = src.length();
				340
				341	// Look for a following COMBINING DOT ABOVE
				342	for (int i = index + Character.charCount(src.codePointAt(index));
				343	i < len; i += Character.charCount(ch)) {
				344
				345	ch = src.codePointAt(i);
				346
				347	if (ch == '\u0307') {
				348	return true;
				349	} else {
				350	cc = Normalizer.getCombiningClass(ch);
				351	if ((cc == 0) \|\| (cc == COMBINING_CLASS_ABOVE)) {
				352	return false;
				353	}
				354	}
				355	}
				356
				357	return false;
				358	}
				359
				360	/**
				361	* Examines whether a character is 'cased'.
				362	*
				363	* A character C is defined to be 'cased' if and only if at least one of
				364	* following are true for C: uppercase==true, or lowercase==true, or
				365	* general_category==titlecase_letter.
				366	*
				367	* The uppercase and lowercase property values are specified in the data
				368	* file DerivedCoreProperties.txt in the Unicode Character Database.
				369	*/
				370	private static boolean isCased(int ch) {
				371	int type = Character.getType(ch);
				372	if (type == Character.LOWERCASE_LETTER \|\|
				373	type == Character.UPPERCASE_LETTER \|\|
				374	type == Character.TITLECASE_LETTER) {
				375	return true;
				376	} else {
				377	// Check for Other_Lowercase and Other_Uppercase
				378	//
				379	if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
				380	// MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
				381	return true;
				382	} else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
				383	// MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
				384	return true;
				385	} else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
				386	// MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
				387	return true;
				388	} else if (ch == 0x0345) {
				389	// COMBINING GREEK YPOGEGRAMMENI
				390	return true;
				391	} else if (ch == 0x037A) {
				392	// GREEK YPOGEGRAMMENI
				393	return true;
				394	} else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
				395	// MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
				396	return true;
				397	} else if ((ch >= 0x2160) && (ch <= 0x217F)) {
				398	// ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
				399	// SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
				400	return true;
				401	} else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
				402	// CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
				403	// CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
				404	return true;
				405	} else {
				406	return false;
				407	}
				408	}
				409	}
				410
				411	private static boolean isSoftDotted(int ch) {
				412	switch (ch) {
				413	case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
				414	case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
				415	case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
				416	case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
				417	case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
				418	case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
				419	case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
				420	case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
				421	case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
				422	case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
				423	return true;
				424	default:
				425	return false;
				426	}
				427	}
				428
				429	/**
				430	* An internal class that represents an entry in the Special Casing Properties.
				431	*/
				432	static class Entry {
				433	int ch;
				434	char [] lower;
				435	char [] upper;
				436	String lang;
				437	int condition;
				438
				439	Entry(int ch, char[] lower, char[] upper, String lang, int condition) {
				440	this.ch = ch;
				441	this.lower = lower;
				442	this.upper = upper;
				443	this.lang = lang;
				444	this.condition = condition;
				445	}
				446
				447	int getCodePoint() {
				448	return ch;
				449	}
				450
				451	char[] getLowerCase() {
				452	return lower;
				453	}
				454
				455	char[] getUpperCase() {
				456	return upper;
				457	}
				458
				459	String getLanguage() {
				460	return lang;
				461	}
				462
				463	int getCondition() {
				464	return condition;
				465	}
				466	}
				467	}