J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 1996-2001 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | /* |
| 27 | * (C) Copyright Taligent, Inc. 1996,1997 - All Rights Reserved |
| 28 | * (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved |
| 29 | * |
| 30 | * The original version of this source code and documentation is copyrighted |
| 31 | * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These |
| 32 | * materials are provided under terms of a License Agreement between Taligent |
| 33 | * and Sun. This technology is protected by multiple US and International |
| 34 | * patents. This notice and attribution to Taligent may not be removed. |
| 35 | * Taligent is a registered trademark of Taligent, Inc. |
| 36 | * |
| 37 | */ |
| 38 | |
| 39 | package java.text; |
/**
 * CollationRules contains the default en_US collation rules as a base
 * for building other collation tables.
 * <p>Note that decompositions are done before these rules are used,
 * so they do not have to contain accented characters, such as A-grave.
 * <p>The rule text uses the {@link RuleBasedCollator} rule syntax:
 * {@code =} declares an identical relation, {@code ;} a secondary
 * difference, {@code ,} a tertiary difference, {@code <} a primary
 * difference, and {@code &} resets the insertion point to an existing
 * entry. Characters that would otherwise be read as syntax or white space
 * are enclosed in single quotes.
 * @see RuleBasedCollator
 * @see LocaleElements
 * @author Helena Shih, Mark Davis
 */
final class CollationRules {
    /**
     * The default rule text, assembled from adjacent string literals.
     * <p>It is deliberately a plain concatenation of literals (no
     * {@code new String(...)} wrapper): Strings are immutable, so an explicit
     * copy adds an allocation without any benefit, and a pure literal
     * concatenation is a compile-time constant.
     */
    static final String DEFAULTRULES =
        ""                                           // no FRENCH accent order by default, add in French Delta
        // IGNORABLES (up to first < character)
        // COMPLETELY IGNORE format characters
        + "='\u200B'=\u200C=\u200D=\u200E=\u200F"
        // Control Characters
        + "=\u0000 =\u0001 =\u0002 =\u0003 =\u0004"  // null, ... eot
        + "=\u0005 =\u0006 =\u0007 =\u0008 ='\u0009'" // enq, ...
        + "='\u000b' =\u000e"                        // vt, so
        + "=\u000f ='\u0010' =\u0011 =\u0012 =\u0013" // si, dle, dc1, dc2, dc3
        + "=\u0014 =\u0015 =\u0016 =\u0017 =\u0018"  // dc4, nak, syn, etb, can
        + "=\u0019 =\u001a =\u001b =\u001c =\u001d"  // em, sub, esc, fs, gs
        + "=\u001e =\u001f =\u007f"                  // rs, us, del
        // ...then the C1 Latin 1 reserved control codes
        + "=\u0080 =\u0081 =\u0082 =\u0083 =\u0084 =\u0085"
        + "=\u0086 =\u0087 =\u0088 =\u0089 =\u008a =\u008b"
        + "=\u008c =\u008d =\u008e =\u008f =\u0090 =\u0091"
        + "=\u0092 =\u0093 =\u0094 =\u0095 =\u0096 =\u0097"
        + "=\u0098 =\u0099 =\u009a =\u009b =\u009c =\u009d"
        + "=\u009e =\u009f"
        // IGNORE except for secondary, tertiary difference
        // Spaces
        + ";'\u0020';'\u00A0'"                       // spaces
        + ";'\u2000';'\u2001';'\u2002';'\u2003';'\u2004'" // spaces
        + ";'\u2005';'\u2006';'\u2007';'\u2008';'\u2009'" // spaces
        + ";'\u200A';'\u3000';'\uFEFF'"              // spaces
        // NOTE(review): tab and vertical tab are also listed in the control
        // character run above; presumably the later entry takes effect --
        // confirm against the rule parser before changing either line.
        + ";'\r' ;'\t' ;'\n';'\f';'\u000b'"          // whitespace

        // Non-spacing accents

        + ";\u0301"                                  // non-spacing acute accent
        + ";\u0300"                                  // non-spacing grave accent
        + ";\u0306"                                  // non-spacing breve accent
        + ";\u0302"                                  // non-spacing circumflex accent
        + ";\u030c"                                  // non-spacing caron/hacek accent
        + ";\u030a"                                  // non-spacing ring above accent
        + ";\u030d"                                  // non-spacing vertical line above
        + ";\u0308"                                  // non-spacing diaeresis accent
        + ";\u030b"                                  // non-spacing double acute accent
        + ";\u0303"                                  // non-spacing tilde accent
        + ";\u0307"                                  // non-spacing dot above/overdot accent
        + ";\u0304"                                  // non-spacing macron accent
        + ";\u0337"                                  // non-spacing short slash overlay (overstruck diacritic)
        + ";\u0327"                                  // non-spacing cedilla accent
        + ";\u0328"                                  // non-spacing ogonek accent
        + ";\u0323"                                  // non-spacing dot-below/underdot accent
        + ";\u0332"                                  // non-spacing underscore/underline accent
        // with the rest of the general diacritical marks in binary order
        + ";\u0305"                                  // non-spacing overscore/overline
        + ";\u0309"                                  // non-spacing hook above
        + ";\u030e"                                  // non-spacing double vertical line above
        + ";\u030f"                                  // non-spacing double grave
        + ";\u0310"                                  // non-spacing chandrabindu
        + ";\u0311"                                  // non-spacing inverted breve
        + ";\u0312"                                  // non-spacing turned comma above/cedilla above
        + ";\u0313"                                  // non-spacing comma above
        + ";\u0314"                                  // non-spacing reversed comma above
        + ";\u0315"                                  // non-spacing comma above right
        + ";\u0316"                                  // non-spacing grave below
        + ";\u0317"                                  // non-spacing acute below
        + ";\u0318"                                  // non-spacing left tack below
        + ";\u0319"                                  // non-spacing tack below
        + ";\u031a"                                  // non-spacing left angle above
        + ";\u031b"                                  // non-spacing horn
        + ";\u031c"                                  // non-spacing left half ring below
        + ";\u031d"                                  // non-spacing up tack below
        + ";\u031e"                                  // non-spacing down tack below
        + ";\u031f"                                  // non-spacing plus sign below
        + ";\u0320"                                  // non-spacing minus sign below
        + ";\u0321"                                  // non-spacing palatalized hook below
        + ";\u0322"                                  // non-spacing retroflex hook below
        + ";\u0324"                                  // non-spacing double dot below
        + ";\u0325"                                  // non-spacing ring below
        + ";\u0326"                                  // non-spacing comma below
        + ";\u0329"                                  // non-spacing vertical line below
        + ";\u032a"                                  // non-spacing bridge below
        + ";\u032b"                                  // non-spacing inverted double arch below
        + ";\u032c"                                  // non-spacing hacek below
        + ";\u032d"                                  // non-spacing circumflex below
        + ";\u032e"                                  // non-spacing breve below
        + ";\u032f"                                  // non-spacing inverted breve below
        + ";\u0330"                                  // non-spacing tilde below
        + ";\u0331"                                  // non-spacing macron below
        + ";\u0333"                                  // non-spacing double underscore
        + ";\u0334"                                  // non-spacing tilde overlay
        + ";\u0335"                                  // non-spacing short bar overlay
        + ";\u0336"                                  // non-spacing long bar overlay
        + ";\u0338"                                  // non-spacing long slash overlay
        + ";\u0339"                                  // non-spacing right half ring below
        + ";\u033a"                                  // non-spacing inverted bridge below
        + ";\u033b"                                  // non-spacing square below
        + ";\u033c"                                  // non-spacing seagull below
        + ";\u033d"                                  // non-spacing x above
        + ";\u033e"                                  // non-spacing vertical tilde
        + ";\u033f"                                  // non-spacing double overscore
        // Intentionally not listed: U+0340 non-spacing grave tone mark (== U+0300)
        // Intentionally not listed: U+0341 non-spacing acute tone mark (== U+0301)
        // The trailing ';' below is the separator for the first entry of the
        // next literal; U+0343 (== U+0313) is intentionally left out between them.
        + ";\u0342;"
        + "\u0344;\u0345;\u0360;\u0361"              // newer
        + ";\u0483;\u0484;\u0485;\u0486"             // Cyrillic accents

        + ";\u20D0;\u20D1;\u20D2"                    // symbol accents
        + ";\u20D3;\u20D4;\u20D5"                    // symbol accents
        + ";\u20D6;\u20D7;\u20D8"                    // symbol accents
        + ";\u20D9;\u20DA;\u20DB"                    // symbol accents
        + ";\u20DC;\u20DD;\u20DE"                    // symbol accents
        + ";\u20DF;\u20E0;\u20E1"                    // symbol accents

        + ",'\u002D';\u00AD"                         // dashes
        + ";\u2010;\u2011;\u2012"                    // dashes
        + ";\u2013;\u2014;\u2015"                    // dashes
        + ";\u2212"                                  // dashes

        // other punctuation

        + "<'\u005f'"                                // underline/underscore (spacing)
        + "<\u00af"                                  // overline or macron (spacing)
        + "<'\u002c'"                                // comma (spacing)
        + "<'\u003b'"                                // semicolon
        + "<'\u003a'"                                // colon
        + "<'\u0021'"                                // exclamation point
        + "<\u00a1"                                  // inverted exclamation point
        + "<'\u003f'"                                // question mark
        + "<\u00bf"                                  // inverted question mark
        + "<'\u002f'"                                // slash
        + "<'\u002e'"                                // period/full stop
        + "<\u00b4"                                  // acute accent (spacing)
        + "<'\u0060'"                                // grave accent (spacing)
        + "<'\u005e'"                                // circumflex accent (spacing)
        + "<\u00a8"                                  // diaeresis/umlaut accent (spacing)
        + "<'\u007e'"                                // tilde accent (spacing)
        + "<\u00b7"                                  // middle dot (spacing)
        + "<\u00b8"                                  // cedilla accent (spacing)
        + "<'\u0027'"                                // apostrophe
        + "<'\"'"                                    // quotation marks
        + "<\u00ab"                                  // left angle quotes
        + "<\u00bb"                                  // right angle quotes
        + "<'\u0028'"                                // left parenthesis
        + "<'\u0029'"                                // right parenthesis
        + "<'\u005b'"                                // left bracket
        + "<'\u005d'"                                // right bracket
        + "<'\u007b'"                                // left brace
        + "<'\u007d'"                                // right brace
        + "<\u00a7"                                  // section symbol
        + "<\u00b6"                                  // paragraph symbol
        + "<\u00a9"                                  // copyright symbol
        + "<\u00ae"                                  // registered trademark symbol
        + "<'\u0040'"                                // at sign
        + "<\u00a4"                                  // international currency symbol
        + "<\u0e3f"                                  // baht sign
        + "<\u00a2"                                  // cent sign
        + "<\u20a1"                                  // colon sign
        + "<\u20a2"                                  // cruzeiro sign
        + "<'\u0024'"                                // dollar sign
        + "<\u20ab"                                  // dong sign
        + "<\u20ac"                                  // euro sign
        + "<\u20a3"                                  // franc sign
        + "<\u20a4"                                  // lira sign
        + "<\u20a5"                                  // mill sign
        + "<\u20a6"                                  // naira sign
        + "<\u20a7"                                  // peseta sign
        + "<\u00a3"                                  // pound-sterling sign
        + "<\u20a8"                                  // rupee sign
        + "<\u20aa"                                  // new shekel sign
        + "<\u20a9"                                  // won sign
        + "<\u00a5"                                  // yen sign
        + "<'\u002a'"                                // asterisk
        + "<'\\'"                                    // backslash
        + "<'\u0026'"                                // ampersand
        + "<'\u0023'"                                // number sign
        + "<'\u0025'"                                // percent sign
        + "<'\u002b'"                                // plus sign
        + "<\u00b1"                                  // plus-or-minus sign
        + "<\u00f7"                                  // divide sign
        + "<\u00d7"                                  // multiply sign
        + "<'\u003c'"                                // less-than sign
        + "<'\u003d'"                                // equal sign
        + "<'\u003e'"                                // greater-than sign
        + "<\u00ac"                                  // end of line symbol/logical NOT symbol
        + "<'\u007c'"                                // vertical line/logical OR symbol
        + "<\u00a6"                                  // broken vertical line
        + "<\u00b0"                                  // degree symbol
        + "<\u00b5"                                  // micro symbol

        // NUMERICS

        + "<0<1<2<3<4<5<6<7<8<9"
        + "<\u00bc<\u00bd<\u00be"                    // 1/4,1/2,3/4 fractions

        // NON-IGNORABLES
        + "<a,A"
        + "<b,B"
        + "<c,C"
        + "<d,D"
        + "<\u00F0,\u00D0"                           // eth
        + "<e,E"
        + "<f,F"
        + "<g,G"
        + "<h,H"
        + "<i,I"
        + "<j,J"
        + "<k,K"
        + "<l,L"
        + "<m,M"
        + "<n,N"
        + "<o,O"
        + "<p,P"
        + "<q,Q"
        + "<r,R"
        + "<s, S & SS,\u00DF"                        // s-zet
        + "<t,T"
        + "& TH, \u00DE &TH, \u00FE "                // thorn
        + "<u,U"
        + "<v,V"
        + "<w,W"
        + "<x,X"
        + "<y,Y"
        + "<z,Z"
        + "&AE,\u00C6"                               // ae & AE ligature
        + "&AE,\u00E6"
        + "&OE,\u0152"                               // oe & OE ligature
        + "&OE,\u0153";
}