blob: d81e550eef5ffcb00a5b0ad30fc6c97cf86e0f92 [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 2003-2005 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26package java.lang;
27
28import java.text.BreakIterator;
29import java.util.HashSet;
30import java.util.Hashtable;
31import java.util.Iterator;
32import java.util.Locale;
33import sun.text.Normalizer;
34
35
36/**
37 * This is a utility class for <code>String.toLowerCase()</code> and
38 * <code>String.toUpperCase()</code>, that handles special casing with
39 * conditions. In other words, it handles the mappings with conditions
40 * that are defined in
41 * <a href="http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt">Special
42 * Casing Properties</a> file.
43 * <p>
44 * Note that the unconditional case mappings (including 1:M mappings)
45 * are handled in <code>Character.toLower/UpperCase()</code>.
46 */
47final class ConditionalSpecialCasing {
48
49 // context conditions.
50 final static int FINAL_CASED = 1;
51 final static int AFTER_SOFT_DOTTED = 2;
52 final static int MORE_ABOVE = 3;
53 final static int AFTER_I = 4;
54 final static int NOT_BEFORE_DOT = 5;
55
56 // combining class definitions
57 final static int COMBINING_CLASS_ABOVE = 230;
58
59 // Special case mapping entries
60 static Entry[] entry = {
61 //# ================================================================================
62 //# Conditional mappings
63 //# ================================================================================
64 new Entry(0x03A3, new char[]{0x03C2}, new char[]{0x03A3}, null, FINAL_CASED), // # GREEK CAPITAL LETTER SIGMA
65
66 //# ================================================================================
67 //# Locale-sensitive mappings
68 //# ================================================================================
69 //# Lithuanian
70 new Entry(0x0307, new char[]{0x0307}, new char[]{}, "lt", AFTER_SOFT_DOTTED), // # COMBINING DOT ABOVE
71 new Entry(0x0049, new char[]{0x0069, 0x0307}, new char[]{0x0049}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I
72 new Entry(0x004A, new char[]{0x006A, 0x0307}, new char[]{0x004A}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER J
73 new Entry(0x012E, new char[]{0x012F, 0x0307}, new char[]{0x012E}, "lt", MORE_ABOVE), // # LATIN CAPITAL LETTER I WITH OGONEK
74 new Entry(0x00CC, new char[]{0x0069, 0x0307, 0x0300}, new char[]{0x00CC}, "lt", 0), // # LATIN CAPITAL LETTER I WITH GRAVE
75 new Entry(0x00CD, new char[]{0x0069, 0x0307, 0x0301}, new char[]{0x00CD}, "lt", 0), // # LATIN CAPITAL LETTER I WITH ACUTE
76 new Entry(0x0128, new char[]{0x0069, 0x0307, 0x0303}, new char[]{0x0128}, "lt", 0), // # LATIN CAPITAL LETTER I WITH TILDE
77
78 //# ================================================================================
79 //# Turkish and Azeri
80// new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
81// new Entry(0x0130, new char[]{0x0069}, new char[]{0x0130}, "az", 0), // # LATIN CAPITAL LETTER I WITH DOT ABOVE
82 new Entry(0x0307, new char[]{}, new char[]{0x0307}, "tr", AFTER_I), // # COMBINING DOT ABOVE
83 new Entry(0x0307, new char[]{}, new char[]{0x0307}, "az", AFTER_I), // # COMBINING DOT ABOVE
84 new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "tr", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
85 new Entry(0x0049, new char[]{0x0131}, new char[]{0x0049}, "az", NOT_BEFORE_DOT), // # LATIN CAPITAL LETTER I
86 new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "tr", 0), // # LATIN SMALL LETTER I
87 new Entry(0x0069, new char[]{0x0069}, new char[]{0x0130}, "az", 0) // # LATIN SMALL LETTER I
88 };
89
90 // A hash table that contains the above entries
91 static Hashtable entryTable = new Hashtable();
92 static {
93 // create hashtable from the entry
94 for (int i = 0; i < entry.length; i ++) {
95 Entry cur = entry[i];
96 Integer cp = new Integer(cur.getCodePoint());
97 HashSet set = (HashSet)entryTable.get(cp);
98 if (set == null) {
99 set = new HashSet();
100 }
101 set.add(cur);
102 entryTable.put(cp, set);
103 }
104 }
105
106 static int toLowerCaseEx(String src, int index, Locale locale) {
107 char[] result = lookUpTable(src, index, locale, true);
108
109 if (result != null) {
110 if (result.length == 1) {
111 return result[0];
112 } else {
113 return Character.ERROR;
114 }
115 } else {
116 // default to Character class' one
117 return Character.toLowerCase(src.codePointAt(index));
118 }
119 }
120
121 static int toUpperCaseEx(String src, int index, Locale locale) {
122 char[] result = lookUpTable(src, index, locale, false);
123
124 if (result != null) {
125 if (result.length == 1) {
126 return result[0];
127 } else {
128 return Character.ERROR;
129 }
130 } else {
131 // default to Character class' one
132 return Character.toUpperCaseEx(src.codePointAt(index));
133 }
134 }
135
136 static char[] toLowerCaseCharArray(String src, int index, Locale locale) {
137 return lookUpTable(src, index, locale, true);
138 }
139
140 static char[] toUpperCaseCharArray(String src, int index, Locale locale) {
141 char[] result = lookUpTable(src, index, locale, false);
142 if (result != null) {
143 return result;
144 } else {
145 return Character.toUpperCaseCharArray(src.codePointAt(index));
146 }
147 }
148
149 private static char[] lookUpTable(String src, int index, Locale locale, boolean bLowerCasing) {
150 HashSet set = (HashSet)entryTable.get(new Integer(src.codePointAt(index)));
151
152 if (set != null) {
153 Iterator iter = set.iterator();
154 String currentLang = locale.getLanguage();
155 while (iter.hasNext()) {
156 Entry entry = (Entry)iter.next();
157 String conditionLang= entry.getLanguage();
158 if (((conditionLang == null) || (conditionLang.equals(currentLang))) &&
159 isConditionMet(src, index, locale, entry.getCondition())) {
160 return (bLowerCasing ? entry.getLowerCase() : entry.getUpperCase());
161 }
162 }
163 }
164
165 return null;
166 }
167
168 private static boolean isConditionMet(String src, int index, Locale locale, int condition) {
169 switch (condition) {
170 case FINAL_CASED:
171 return isFinalCased(src, index, locale);
172
173 case AFTER_SOFT_DOTTED:
174 return isAfterSoftDotted(src, index);
175
176 case MORE_ABOVE:
177 return isMoreAbove(src, index);
178
179 case AFTER_I:
180 return isAfterI(src, index);
181
182 case NOT_BEFORE_DOT:
183 return !isBeforeDot(src, index);
184
185 default:
186 return true;
187 }
188 }
189
190 /**
191 * Implements the "Final_Cased" condition
192 *
193 * Specification: Within the closest word boundaries containing C, there is a cased
194 * letter before C, and there is no cased letter after C.
195 *
196 * Regular Expression:
197 * Before C: [{cased==true}][{wordBoundary!=true}]*
198 * After C: !([{wordBoundary!=true}]*[{cased}])
199 */
200 private static boolean isFinalCased(String src, int index, Locale locale) {
201 BreakIterator wordBoundary = BreakIterator.getWordInstance(locale);
202 wordBoundary.setText(src);
203 int ch;
204
205 // Look for a preceding 'cased' letter
206 for (int i = index; (i >= 0) && !wordBoundary.isBoundary(i);
207 i -= Character.charCount(ch)) {
208
209 ch = src.codePointBefore(i);
210 if (isCased(ch)) {
211
212 int len = src.length();
213 // Check that there is no 'cased' letter after the index
214 for (i = index + Character.charCount(src.codePointAt(index));
215 (i < len) && !wordBoundary.isBoundary(i);
216 i += Character.charCount(ch)) {
217
218 ch = src.codePointAt(i);
219 if (isCased(ch)) {
220 return false;
221 }
222 }
223
224 return true;
225 }
226 }
227
228 return false;
229 }
230
231 /**
232 * Implements the "After_I" condition
233 *
234 * Specification: The last preceding base character was an uppercase I,
235 * and there is no intervening combining character class 230 (ABOVE).
236 *
237 * Regular Expression:
238 * Before C: [I]([{cc!=230}&{cc!=0}])*
239 */
240 private static boolean isAfterI(String src, int index) {
241 int ch;
242 int cc;
243
244 // Look for the last preceding base character
245 for (int i = index; i > 0; i -= Character.charCount(ch)) {
246
247 ch = src.codePointBefore(i);
248
249 if (ch == 'I') {
250 return true;
251 } else {
252 cc = Normalizer.getCombiningClass(ch);
253 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
254 return false;
255 }
256 }
257 }
258
259 return false;
260 }
261
262 /**
263 * Implements the "After_Soft_Dotted" condition
264 *
265 * Specification: The last preceding character with combining class
266 * of zero before C was Soft_Dotted, and there is no intervening
267 * combining character class 230 (ABOVE).
268 *
269 * Regular Expression:
270 * Before C: [{Soft_Dotted==true}]([{cc!=230}&{cc!=0}])*
271 */
272 private static boolean isAfterSoftDotted(String src, int index) {
273 int ch;
274 int cc;
275
276 // Look for the last preceding character
277 for (int i = index; i > 0; i -= Character.charCount(ch)) {
278
279 ch = src.codePointBefore(i);
280
281 if (isSoftDotted(ch)) {
282 return true;
283 } else {
284 cc = Normalizer.getCombiningClass(ch);
285 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
286 return false;
287 }
288 }
289 }
290
291 return false;
292 }
293
294 /**
295 * Implements the "More_Above" condition
296 *
297 * Specification: C is followed by one or more characters of combining
298 * class 230 (ABOVE) in the combining character sequence.
299 *
300 * Regular Expression:
301 * After C: [{cc!=0}]*[{cc==230}]
302 */
303 private static boolean isMoreAbove(String src, int index) {
304 int ch;
305 int cc;
306 int len = src.length();
307
308 // Look for a following ABOVE combining class character
309 for (int i = index + Character.charCount(src.codePointAt(index));
310 i < len; i += Character.charCount(ch)) {
311
312 ch = src.codePointAt(i);
313 cc = Normalizer.getCombiningClass(ch);
314
315 if (cc == COMBINING_CLASS_ABOVE) {
316 return true;
317 } else if (cc == 0) {
318 return false;
319 }
320 }
321
322 return false;
323 }
324
325 /**
326 * Implements the "Before_Dot" condition
327 *
328 * Specification: C is followed by <code>U+0307 COMBINING DOT ABOVE</code>.
329 * Any sequence of characters with a combining class that is
330 * neither 0 nor 230 may intervene between the current character
331 * and the combining dot above.
332 *
333 * Regular Expression:
334 * After C: ([{cc!=230}&{cc!=0}])*[\u0307]
335 */
336 private static boolean isBeforeDot(String src, int index) {
337 int ch;
338 int cc;
339 int len = src.length();
340
341 // Look for a following COMBINING DOT ABOVE
342 for (int i = index + Character.charCount(src.codePointAt(index));
343 i < len; i += Character.charCount(ch)) {
344
345 ch = src.codePointAt(i);
346
347 if (ch == '\u0307') {
348 return true;
349 } else {
350 cc = Normalizer.getCombiningClass(ch);
351 if ((cc == 0) || (cc == COMBINING_CLASS_ABOVE)) {
352 return false;
353 }
354 }
355 }
356
357 return false;
358 }
359
360 /**
361 * Examines whether a character is 'cased'.
362 *
363 * A character C is defined to be 'cased' if and only if at least one of
364 * following are true for C: uppercase==true, or lowercase==true, or
365 * general_category==titlecase_letter.
366 *
367 * The uppercase and lowercase property values are specified in the data
368 * file DerivedCoreProperties.txt in the Unicode Character Database.
369 */
370 private static boolean isCased(int ch) {
371 int type = Character.getType(ch);
372 if (type == Character.LOWERCASE_LETTER ||
373 type == Character.UPPERCASE_LETTER ||
374 type == Character.TITLECASE_LETTER) {
375 return true;
376 } else {
377 // Check for Other_Lowercase and Other_Uppercase
378 //
379 if ((ch >= 0x02B0) && (ch <= 0x02B8)) {
380 // MODIFIER LETTER SMALL H..MODIFIER LETTER SMALL Y
381 return true;
382 } else if ((ch >= 0x02C0) && (ch <= 0x02C1)) {
383 // MODIFIER LETTER GLOTTAL STOP..MODIFIER LETTER REVERSED GLOTTAL STOP
384 return true;
385 } else if ((ch >= 0x02E0) && (ch <= 0x02E4)) {
386 // MODIFIER LETTER SMALL GAMMA..MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
387 return true;
388 } else if (ch == 0x0345) {
389 // COMBINING GREEK YPOGEGRAMMENI
390 return true;
391 } else if (ch == 0x037A) {
392 // GREEK YPOGEGRAMMENI
393 return true;
394 } else if ((ch >= 0x1D2C) && (ch <= 0x1D61)) {
395 // MODIFIER LETTER CAPITAL A..MODIFIER LETTER SMALL CHI
396 return true;
397 } else if ((ch >= 0x2160) && (ch <= 0x217F)) {
398 // ROMAN NUMERAL ONE..ROMAN NUMERAL ONE THOUSAND
399 // SMALL ROMAN NUMERAL ONE..SMALL ROMAN NUMERAL ONE THOUSAND
400 return true;
401 } else if ((ch >= 0x24B6) && (ch <= 0x24E9)) {
402 // CIRCLED LATIN CAPITAL LETTER A..CIRCLED LATIN CAPITAL LETTER Z
403 // CIRCLED LATIN SMALL LETTER A..CIRCLED LATIN SMALL LETTER Z
404 return true;
405 } else {
406 return false;
407 }
408 }
409 }
410
411 private static boolean isSoftDotted(int ch) {
412 switch (ch) {
413 case 0x0069: // Soft_Dotted # L& LATIN SMALL LETTER I
414 case 0x006A: // Soft_Dotted # L& LATIN SMALL LETTER J
415 case 0x012F: // Soft_Dotted # L& LATIN SMALL LETTER I WITH OGONEK
416 case 0x0268: // Soft_Dotted # L& LATIN SMALL LETTER I WITH STROKE
417 case 0x0456: // Soft_Dotted # L& CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
418 case 0x0458: // Soft_Dotted # L& CYRILLIC SMALL LETTER JE
419 case 0x1D62: // Soft_Dotted # L& LATIN SUBSCRIPT SMALL LETTER I
420 case 0x1E2D: // Soft_Dotted # L& LATIN SMALL LETTER I WITH TILDE BELOW
421 case 0x1ECB: // Soft_Dotted # L& LATIN SMALL LETTER I WITH DOT BELOW
422 case 0x2071: // Soft_Dotted # L& SUPERSCRIPT LATIN SMALL LETTER I
423 return true;
424 default:
425 return false;
426 }
427 }
428
429 /**
430 * An internal class that represents an entry in the Special Casing Properties.
431 */
432 static class Entry {
433 int ch;
434 char [] lower;
435 char [] upper;
436 String lang;
437 int condition;
438
439 Entry(int ch, char[] lower, char[] upper, String lang, int condition) {
440 this.ch = ch;
441 this.lower = lower;
442 this.upper = upper;
443 this.lang = lang;
444 this.condition = condition;
445 }
446
447 int getCodePoint() {
448 return ch;
449 }
450
451 char[] getLowerCase() {
452 return lower;
453 }
454
455 char[] getUpperCase() {
456 return upper;
457 }
458
459 String getLanguage() {
460 return lang;
461 }
462
463 int getCondition() {
464 return condition;
465 }
466 }
467}