J. Duke | 319a3b9 | 2007-12-01 00:00:00 +0000 | [diff] [blame^] | 1 | /* |
| 2 | * Copyright 2000-2006 Sun Microsystems, Inc. All Rights Reserved. |
| 3 | * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
| 4 | * |
| 5 | * This code is free software; you can redistribute it and/or modify it |
| 6 | * under the terms of the GNU General Public License version 2 only, as |
| 7 | * published by the Free Software Foundation. Sun designates this |
| 8 | * particular file as subject to the "Classpath" exception as provided |
| 9 | * by Sun in the LICENSE file that accompanied this code. |
| 10 | * |
| 11 | * This code is distributed in the hope that it will be useful, but WITHOUT |
| 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
| 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
| 14 | * version 2 for more details (a copy is included in the LICENSE file that |
| 15 | * accompanied this code). |
| 16 | * |
| 17 | * You should have received a copy of the GNU General Public License version |
| 18 | * 2 along with this work; if not, write to the Free Software Foundation, |
| 19 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
| 20 | * |
| 21 | * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, |
| 22 | * CA 95054 USA or visit www.sun.com if you need additional information or |
| 23 | * have any questions. |
| 24 | */ |
| 25 | |
| 26 | package java.nio.charset; |
| 27 | |
| 28 | import java.nio.ByteBuffer; |
| 29 | import java.nio.CharBuffer; |
| 30 | import java.nio.charset.spi.CharsetProvider; |
| 31 | import java.security.AccessController; |
| 32 | import java.security.AccessControlException; |
| 33 | import java.security.PrivilegedAction; |
| 34 | import java.util.Collections; |
| 35 | import java.util.HashSet; |
| 36 | import java.util.Iterator; |
| 37 | import java.util.Locale; |
| 38 | import java.util.Map; |
| 39 | import java.util.NoSuchElementException; |
| 40 | import java.util.Set; |
| 41 | import java.util.ServiceLoader; |
| 42 | import java.util.ServiceConfigurationError; |
| 43 | import java.util.SortedMap; |
| 44 | import java.util.TreeMap; |
| 45 | import sun.misc.ASCIICaseInsensitiveComparator; |
| 46 | import sun.nio.cs.StandardCharsets; |
| 47 | import sun.nio.cs.ThreadLocalCoders; |
| 48 | import sun.security.action.GetPropertyAction; |
| 49 | |
| 50 | |
| 51 | /** |
| 52 | * A named mapping between sequences of sixteen-bit Unicode <a |
| 53 | * href="../../lang/Character.html#unicode">code units</a> and sequences of |
| 54 | * bytes. This class defines methods for creating decoders and encoders and |
| 55 | * for retrieving the various names associated with a charset. Instances of |
| 56 | * this class are immutable. |
| 57 | * |
| 58 | * <p> This class also defines static methods for testing whether a particular |
| 59 | * charset is supported, for locating charset instances by name, and for |
| 60 | * constructing a map that contains every charset for which support is |
| 61 | * available in the current Java virtual machine. Support for new charsets can |
| 62 | * be added via the service-provider interface defined in the {@link |
| 63 | * java.nio.charset.spi.CharsetProvider} class. |
| 64 | * |
| 65 | * <p> All of the methods defined in this class are safe for use by multiple |
| 66 | * concurrent threads. |
| 67 | * |
| 68 | * |
| 69 | * <a name="names"></a><a name="charenc"></a> |
| 70 | * <h4>Charset names</h4> |
| 71 | * |
| 72 | * <p> Charsets are named by strings composed of the following characters: |
| 73 | * |
| 74 | * <ul> |
| 75 | * |
| 76 | * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt> |
| 77 | * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>), |
| 78 | * |
| 79 | * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt> |
| 80 | * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>), |
| 81 | * |
| 82 | * <li> The digits <tt>'0'</tt> through <tt>'9'</tt> |
| 83 | * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>), |
| 84 | * |
| 85 | * <li> The dash character <tt>'-'</tt> |
| 86 | * (<tt>'\u002d'</tt>, <small>HYPHEN-MINUS</small>), |
| 87 | * |
| 88 | * <li> The period character <tt>'.'</tt> |
| 89 | * (<tt>'\u002e'</tt>, <small>FULL STOP</small>), |
| 90 | * |
| 91 | * <li> The colon character <tt>':'</tt> |
| 92 | * (<tt>'\u003a'</tt>, <small>COLON</small>), and |
| 93 | * |
| 94 | * <li> The underscore character <tt>'_'</tt> |
| 95 | * (<tt>'\u005f'</tt>, <small>LOW LINE</small>). |
| 96 | * |
| 97 | * </ul> |
| 98 | * |
| 99 | * A charset name must begin with either a letter or a digit. The empty string |
| 100 | * is not a legal charset name. Charset names are not case-sensitive; that is, |
| 101 | * case is always ignored when comparing charset names. Charset names |
| 102 | * generally follow the conventions documented in <a |
| 103 | * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278: IANA Charset |
| 104 | * Registration Procedures</i></a>. |
| 105 | * |
| 106 | * <p> Every charset has a <i>canonical name</i> and may also have one or more |
| 107 | * <i>aliases</i>. The canonical name is returned by the {@link #name() name} method |
| 108 | * of this class. Canonical names are, by convention, usually in upper case. |
| 109 | * The aliases of a charset are returned by the {@link #aliases() aliases} |
| 110 | * method. |
| 111 | * |
| 112 | * <a name="hn"> |
| 113 | * |
| 114 | * <p> Some charsets have an <i>historical name</i> that is defined for |
| 115 | * compatibility with previous versions of the Java platform. A charset's |
| 116 | * historical name is either its canonical name or one of its aliases. The |
| 117 | * historical name is returned by the <tt>getEncoding()</tt> methods of the |
| 118 | * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link |
| 119 | * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes. |
| 120 | * |
| 121 | * <a name="iana"> |
| 122 | * |
| 123 | * <p> If a charset listed in the <a |
| 124 | * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset |
| 125 | * Registry</i></a> is supported by an implementation of the Java platform then |
| 126 | * its canonical name must be the name listed in the registry. Many charsets |
| 127 | * are given more than one name in the registry, in which case the registry |
| 128 | * identifies one of the names as <i>MIME-preferred</i>. If a charset has more |
| 129 | * than one registry name then its canonical name must be the MIME-preferred |
| 130 | * name and the other names in the registry must be valid aliases. If a |
| 131 | * supported charset is not listed in the IANA registry then its canonical name |
| 132 | * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>. |
| 133 | * |
| 134 | * <p> The IANA charset registry does change over time, and so the canonical |
| 135 | * name and the aliases of a particular charset may also change over time. To |
| 136 | * ensure compatibility it is recommended that no alias ever be removed from a |
| 137 | * charset, and that if the canonical name of a charset is changed then its |
| 138 | * previous canonical name be made into an alias. |
| 139 | * |
| 140 | * |
| 141 | * <h4>Standard charsets</h4> |
| 142 | * |
| 143 | * <p> Every implementation of the Java platform is required to support the |
| 144 | * following standard charsets. Consult the release documentation for your |
| 145 | * implementation to see if any other charsets are supported. The behavior |
| 146 | * of such optional charsets may differ between implementations. |
| 147 | * |
| 148 | * <blockquote><table width="80%" summary="Description of standard charsets"> |
| 149 | * <tr><th><p align="left">Charset</p></th><th><p align="left">Description</p></th></tr> |
| 150 | * <tr><td valign=top><tt>US-ASCII</tt></td> |
| 151 | * <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>, |
| 152 | * a.k.a. the Basic Latin block of the Unicode character set</td></tr> |
| 153 | * <tr><td valign=top><tt>ISO-8859-1 </tt></td> |
| 154 | * <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr> |
| 155 | * <tr><td valign=top><tt>UTF-8</tt></td> |
| 156 | * <td>Eight-bit UCS Transformation Format</td></tr> |
| 157 | * <tr><td valign=top><tt>UTF-16BE</tt></td> |
| 158 | * <td>Sixteen-bit UCS Transformation Format, |
| 159 | * big-endian byte order</td></tr> |
| 160 | * <tr><td valign=top><tt>UTF-16LE</tt></td> |
| 161 | * <td>Sixteen-bit UCS Transformation Format, |
| 162 | * little-endian byte order</td></tr> |
| 163 | * <tr><td valign=top><tt>UTF-16</tt></td> |
| 164 | * <td>Sixteen-bit UCS Transformation Format, |
| 165 | * byte order identified by an optional byte-order mark</td></tr> |
| 166 | * </table></blockquote> |
| 167 | * |
| 168 | * <p> The <tt>UTF-8</tt> charset is specified by <a |
| 169 | * href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279</i></a>; the |
| 170 | * transformation format upon which it is based is specified in |
| 171 | * Amendment 2 of ISO 10646-1 and is also described in the <a |
| 172 | * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode |
| 173 | * Standard</i></a>. |
| 174 | * |
| 175 | * <p> The <tt>UTF-16</tt> charsets are specified by <a |
| 176 | * href="http://www.ietf.org/rfc/rfc2781.txt"><i>RFC 2781</i></a>; the |
| 177 | * transformation formats upon which they are based are specified in |
| 178 | * Amendment 1 of ISO 10646-1 and are also described in the <a |
| 179 | * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode |
| 180 | * Standard</i></a>. |
| 181 | * |
| 182 | * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are |
| 183 | * therefore sensitive to byte order. In these encodings the byte order of a |
| 184 | * stream may be indicated by an initial <i>byte-order mark</i> represented by |
| 185 | * the Unicode character <tt>'\uFEFF'</tt>. Byte-order marks are handled |
| 186 | * as follows: |
| 187 | * |
| 188 | * <ul> |
| 189 | * |
| 190 | * <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt> |
| 191 | * charsets ignore byte-order marks; when encoding, they do not write |
| 192 | * byte-order marks. </p></li> |
| 193 | * |
| 194 | * <li><p> When decoding, the <tt>UTF-16</tt> charset interprets a byte-order |
| 195 | * mark to indicate the byte order of the stream but defaults to big-endian |
| 196 | * if there is no byte-order mark; when encoding, it uses big-endian byte |
| 197 | * order and writes a big-endian byte-order mark. </p></li> |
| 198 | * |
| 199 | * </ul> |
| 200 | * |
| 201 | * In any case, when a byte-order mark is read at the beginning of a decoding |
| 202 | * operation it is omitted from the resulting sequence of characters. Byte |
| 203 | * order marks occurring after the first element of an input sequence are not |
| 204 | * omitted since the same code is used to represent <small>ZERO-WIDTH |
| 205 | * NON-BREAKING SPACE</small>. |
| 206 | * |
| 207 | * <p> Every instance of the Java virtual machine has a default charset, which |
| 208 | * may or may not be one of the standard charsets. The default charset is |
| 209 | * determined during virtual-machine startup and typically depends upon the |
| 210 | * locale and charset being used by the underlying operating system. </p> |
| 211 | * |
| 212 | * |
| 213 | * <h4>Terminology</h4> |
| 214 | * |
| 215 | * <p> The name of this class is taken from the terms used in <a |
| 216 | * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278</i></a>. In that |
| 217 | * document a <i>charset</i> is defined as the combination of a coded character |
| 218 | * set and a character-encoding scheme. |
| 219 | * |
| 220 | * <p> A <i>coded character set</i> is a mapping between a set of abstract |
| 221 | * characters and a set of integers. US-ASCII, ISO 8859-1, |
| 222 | * JIS X 0201, and full Unicode, which is the same as |
| 223 | * ISO 10646-1, are examples of coded character sets. |
| 224 | * |
| 225 | * <p> A <i>character-encoding scheme</i> is a mapping between a coded |
| 226 | * character set and a set of octet (eight-bit byte) sequences. UTF-8, UCS-2, |
| 227 | * UTF-16, ISO 2022, and EUC are examples of character-encoding schemes. |
| 228 | * Encoding schemes are often associated with a particular coded character set; |
| 229 | * UTF-8, for example, is used only to encode Unicode. Some schemes, however, |
| 230 | * are associated with multiple character sets; EUC, for example, can be used |
| 231 | * to encode characters in a variety of Asian character sets. |
| 232 | * |
| 233 | * <p> When a coded character set is used exclusively with a single |
| 234 | * character-encoding scheme then the corresponding charset is usually named |
| 235 | * for the character set; otherwise a charset is usually named for the encoding |
| 236 | * scheme and, possibly, the locale of the character sets that it supports. |
| 237 | * Hence <tt>US-ASCII</tt> is the name of the charset for US-ASCII while |
| 238 | * <tt>EUC-JP</tt> is the name of the charset that encodes the |
| 239 | * JIS X 0201, JIS X 0208, and JIS X 0212 |
| 240 | * character sets. |
| 241 | * |
| 242 | * <p> The native character encoding of the Java programming language is |
| 243 | * UTF-16. A charset in the Java platform therefore defines a mapping between |
| 244 | * sequences of sixteen-bit UTF-16 code units and sequences of bytes. </p> |
| 245 | * |
| 246 | * |
| 247 | * @author Mark Reinhold |
| 248 | * @author JSR-51 Expert Group |
| 249 | * @since 1.4 |
| 250 | * |
| 251 | * @see CharsetDecoder |
| 252 | * @see CharsetEncoder |
| 253 | * @see java.nio.charset.spi.CharsetProvider |
| 254 | * @see java.lang.Character |
| 255 | */ |
| 256 | |
| 257 | public abstract class Charset |
| 258 | implements Comparable<Charset> |
| 259 | { |
| 260 | |
| 261 | /* -- Static methods -- */ |
| 262 | |
| 263 | private static String bugLevel = null; |
| 264 | |
| 265 | static boolean atBugLevel(String bl) { // package-private |
| 266 | if (bugLevel == null) { |
| 267 | if (!sun.misc.VM.isBooted()) |
| 268 | return false; |
| 269 | bugLevel = AccessController.doPrivileged( |
| 270 | new GetPropertyAction("sun.nio.cs.bugLevel")); |
| 271 | if (bugLevel == null) |
| 272 | bugLevel = ""; |
| 273 | } |
| 274 | return (bugLevel != null) && bugLevel.equals(bl); |
| 275 | } |
| 276 | |
| 277 | /** |
| 278 | * Checks that the given string is a legal charset name. </p> |
| 279 | * |
| 280 | * @param s |
| 281 | * A purported charset name |
| 282 | * |
| 283 | * @throws IllegalCharsetNameException |
| 284 | * If the given name is not a legal charset name |
| 285 | */ |
| 286 | private static void checkName(String s) { |
| 287 | int n = s.length(); |
| 288 | if (!atBugLevel("1.4")) { |
| 289 | if (n == 0) |
| 290 | throw new IllegalCharsetNameException(s); |
| 291 | } |
| 292 | for (int i = 0; i < n; i++) { |
| 293 | char c = s.charAt(i); |
| 294 | if (c >= 'A' && c <= 'Z') continue; |
| 295 | if (c >= 'a' && c <= 'z') continue; |
| 296 | if (c >= '0' && c <= '9') continue; |
| 297 | if (c == '-' && i != 0) continue; |
| 298 | if (c == ':' && i != 0) continue; |
| 299 | if (c == '_' && i != 0) continue; |
| 300 | if (c == '.' && i != 0) continue; |
| 301 | throw new IllegalCharsetNameException(s); |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | /* The standard set of charsets */ |
| 306 | private static CharsetProvider standardProvider = new StandardCharsets(); |
| 307 | |
| 308 | // Cache of the most-recently-returned charsets, |
| 309 | // along with the names that were used to find them |
| 310 | // |
| 311 | private static volatile Object[] cache1 = null; // "Level 1" cache |
| 312 | private static volatile Object[] cache2 = null; // "Level 2" cache |
| 313 | |
| 314 | private static void cache(String charsetName, Charset cs) { |
| 315 | cache2 = cache1; |
| 316 | cache1 = new Object[] { charsetName, cs }; |
| 317 | } |
| 318 | |
| 319 | // Creates an iterator that walks over the available providers, ignoring |
| 320 | // those whose lookup or instantiation causes a security exception to be |
| 321 | // thrown. Should be invoked with full privileges. |
| 322 | // |
| 323 | private static Iterator providers() { |
| 324 | return new Iterator() { |
| 325 | |
| 326 | ClassLoader cl = ClassLoader.getSystemClassLoader(); |
| 327 | ServiceLoader<CharsetProvider> sl = |
| 328 | ServiceLoader.load(CharsetProvider.class, cl); |
| 329 | Iterator<CharsetProvider> i = sl.iterator(); |
| 330 | |
| 331 | Object next = null; |
| 332 | |
| 333 | private boolean getNext() { |
| 334 | while (next == null) { |
| 335 | try { |
| 336 | if (!i.hasNext()) |
| 337 | return false; |
| 338 | next = i.next(); |
| 339 | } catch (ServiceConfigurationError sce) { |
| 340 | if (sce.getCause() instanceof SecurityException) { |
| 341 | // Ignore security exceptions |
| 342 | continue; |
| 343 | } |
| 344 | throw sce; |
| 345 | } |
| 346 | } |
| 347 | return true; |
| 348 | } |
| 349 | |
| 350 | public boolean hasNext() { |
| 351 | return getNext(); |
| 352 | } |
| 353 | |
| 354 | public Object next() { |
| 355 | if (!getNext()) |
| 356 | throw new NoSuchElementException(); |
| 357 | Object n = next; |
| 358 | next = null; |
| 359 | return n; |
| 360 | } |
| 361 | |
| 362 | public void remove() { |
| 363 | throw new UnsupportedOperationException(); |
| 364 | } |
| 365 | |
| 366 | }; |
| 367 | } |
| 368 | |
| 369 | // Thread-local gate to prevent recursive provider lookups |
| 370 | private static ThreadLocal gate = new ThreadLocal(); |
| 371 | |
| 372 | private static Charset lookupViaProviders(final String charsetName) { |
| 373 | |
| 374 | // The runtime startup sequence looks up standard charsets as a |
| 375 | // consequence of the VM's invocation of System.initializeSystemClass |
| 376 | // in order to, e.g., set system properties and encode filenames. At |
| 377 | // that point the application class loader has not been initialized, |
| 378 | // however, so we can't look for providers because doing so will cause |
| 379 | // that loader to be prematurely initialized with incomplete |
| 380 | // information. |
| 381 | // |
| 382 | if (!sun.misc.VM.isBooted()) |
| 383 | return null; |
| 384 | |
| 385 | if (gate.get() != null) |
| 386 | // Avoid recursive provider lookups |
| 387 | return null; |
| 388 | try { |
| 389 | gate.set(gate); |
| 390 | |
| 391 | return AccessController.doPrivileged( |
| 392 | new PrivilegedAction<Charset>() { |
| 393 | public Charset run() { |
| 394 | for (Iterator i = providers(); i.hasNext();) { |
| 395 | CharsetProvider cp = (CharsetProvider)i.next(); |
| 396 | Charset cs = cp.charsetForName(charsetName); |
| 397 | if (cs != null) |
| 398 | return cs; |
| 399 | } |
| 400 | return null; |
| 401 | } |
| 402 | }); |
| 403 | |
| 404 | } finally { |
| 405 | gate.set(null); |
| 406 | } |
| 407 | } |
| 408 | |
| 409 | /* The extended set of charsets */ |
| 410 | private static Object extendedProviderLock = new Object(); |
| 411 | private static boolean extendedProviderProbed = false; |
| 412 | private static CharsetProvider extendedProvider = null; |
| 413 | |
| 414 | private static void probeExtendedProvider() { |
| 415 | AccessController.doPrivileged(new PrivilegedAction<Object>() { |
| 416 | public Object run() { |
| 417 | try { |
| 418 | Class epc |
| 419 | = Class.forName("sun.nio.cs.ext.ExtendedCharsets"); |
| 420 | extendedProvider = (CharsetProvider)epc.newInstance(); |
| 421 | } catch (ClassNotFoundException x) { |
| 422 | // Extended charsets not available |
| 423 | // (charsets.jar not present) |
| 424 | } catch (InstantiationException x) { |
| 425 | throw new Error(x); |
| 426 | } catch (IllegalAccessException x) { |
| 427 | throw new Error(x); |
| 428 | } |
| 429 | return null; |
| 430 | } |
| 431 | }); |
| 432 | } |
| 433 | |
| 434 | private static Charset lookupExtendedCharset(String charsetName) { |
| 435 | CharsetProvider ecp = null; |
| 436 | synchronized (extendedProviderLock) { |
| 437 | if (!extendedProviderProbed) { |
| 438 | probeExtendedProvider(); |
| 439 | extendedProviderProbed = true; |
| 440 | } |
| 441 | ecp = extendedProvider; |
| 442 | } |
| 443 | return (ecp != null) ? ecp.charsetForName(charsetName) : null; |
| 444 | } |
| 445 | |
| 446 | private static Charset lookup(String charsetName) { |
| 447 | if (charsetName == null) |
| 448 | throw new IllegalArgumentException("Null charset name"); |
| 449 | |
| 450 | Object[] a; |
| 451 | if ((a = cache1) != null && charsetName.equals(a[0])) |
| 452 | return (Charset)a[1]; |
| 453 | // We expect most programs to use one Charset repeatedly. |
| 454 | // We convey a hint to this effect to the VM by putting the |
| 455 | // level 1 cache miss code in a separate method. |
| 456 | return lookup2(charsetName); |
| 457 | } |
| 458 | |
| 459 | private static Charset lookup2(String charsetName) { |
| 460 | Object[] a; |
| 461 | if ((a = cache2) != null && charsetName.equals(a[0])) { |
| 462 | cache2 = cache1; |
| 463 | cache1 = a; |
| 464 | return (Charset)a[1]; |
| 465 | } |
| 466 | |
| 467 | Charset cs; |
| 468 | if ((cs = standardProvider.charsetForName(charsetName)) != null || |
| 469 | (cs = lookupExtendedCharset(charsetName)) != null || |
| 470 | (cs = lookupViaProviders(charsetName)) != null) |
| 471 | { |
| 472 | cache(charsetName, cs); |
| 473 | return cs; |
| 474 | } |
| 475 | |
| 476 | /* Only need to check the name if we didn't find a charset for it */ |
| 477 | checkName(charsetName); |
| 478 | return null; |
| 479 | } |
| 480 | |
| 481 | /** |
| 482 | * Tells whether the named charset is supported. </p> |
| 483 | * |
| 484 | * @param charsetName |
| 485 | * The name of the requested charset; may be either |
| 486 | * a canonical name or an alias |
| 487 | * |
| 488 | * @return <tt>true</tt> if, and only if, support for the named charset |
| 489 | * is available in the current Java virtual machine |
| 490 | * |
| 491 | * @throws IllegalCharsetNameException |
| 492 | * If the given charset name is illegal |
| 493 | * |
| 494 | * @throws IllegalArgumentException |
| 495 | * If the given <tt>charsetName</tt> is null |
| 496 | */ |
| 497 | public static boolean isSupported(String charsetName) { |
| 498 | return (lookup(charsetName) != null); |
| 499 | } |
| 500 | |
| 501 | /** |
| 502 | * Returns a charset object for the named charset. </p> |
| 503 | * |
| 504 | * @param charsetName |
| 505 | * The name of the requested charset; may be either |
| 506 | * a canonical name or an alias |
| 507 | * |
| 508 | * @return A charset object for the named charset |
| 509 | * |
| 510 | * @throws IllegalCharsetNameException |
| 511 | * If the given charset name is illegal |
| 512 | * |
| 513 | * @throws IllegalArgumentException |
| 514 | * If the given <tt>charsetName</tt> is null |
| 515 | * |
| 516 | * @throws UnsupportedCharsetException |
| 517 | * If no support for the named charset is available |
| 518 | * in this instance of the Java virtual machine |
| 519 | */ |
| 520 | public static Charset forName(String charsetName) { |
| 521 | Charset cs = lookup(charsetName); |
| 522 | if (cs != null) |
| 523 | return cs; |
| 524 | throw new UnsupportedCharsetException(charsetName); |
| 525 | } |
| 526 | |
| 527 | // Fold charsets from the given iterator into the given map, ignoring |
| 528 | // charsets whose names already have entries in the map. |
| 529 | // |
| 530 | private static void put(Iterator i, Map m) { |
| 531 | while (i.hasNext()) { |
| 532 | Charset cs = (Charset)i.next(); |
| 533 | if (!m.containsKey(cs.name())) |
| 534 | m.put(cs.name(), cs); |
| 535 | } |
| 536 | } |
| 537 | |
| 538 | /** |
| 539 | * Constructs a sorted map from canonical charset names to charset objects. |
| 540 | * |
| 541 | * <p> The map returned by this method will have one entry for each charset |
| 542 | * for which support is available in the current Java virtual machine. If |
| 543 | * two or more supported charsets have the same canonical name then the |
| 544 | * resulting map will contain just one of them; which one it will contain |
| 545 | * is not specified. </p> |
| 546 | * |
| 547 | * <p> The invocation of this method, and the subsequent use of the |
| 548 | * resulting map, may cause time-consuming disk or network I/O operations |
| 549 | * to occur. This method is provided for applications that need to |
| 550 | * enumerate all of the available charsets, for example to allow user |
| 551 | * charset selection. This method is not used by the {@link #forName |
| 552 | * forName} method, which instead employs an efficient incremental lookup |
| 553 | * algorithm. |
| 554 | * |
| 555 | * <p> This method may return different results at different times if new |
| 556 | * charset providers are dynamically made available to the current Java |
| 557 | * virtual machine. In the absence of such changes, the charsets returned |
| 558 | * by this method are exactly those that can be retrieved via the {@link |
| 559 | * #forName forName} method. </p> |
| 560 | * |
| 561 | * @return An immutable, case-insensitive map from canonical charset names |
| 562 | * to charset objects |
| 563 | */ |
| 564 | public static SortedMap<String,Charset> availableCharsets() { |
| 565 | return AccessController.doPrivileged( |
| 566 | new PrivilegedAction<SortedMap<String,Charset>>() { |
| 567 | public SortedMap<String,Charset> run() { |
| 568 | TreeMap<String,Charset> m = |
| 569 | new TreeMap<String,Charset>( |
| 570 | ASCIICaseInsensitiveComparator.CASE_INSENSITIVE_ORDER); |
| 571 | put(standardProvider.charsets(), m); |
| 572 | for (Iterator i = providers(); i.hasNext();) { |
| 573 | CharsetProvider cp = (CharsetProvider)i.next(); |
| 574 | put(cp.charsets(), m); |
| 575 | } |
| 576 | return Collections.unmodifiableSortedMap(m); |
| 577 | } |
| 578 | }); |
| 579 | } |
| 580 | |
| 581 | private static volatile Charset defaultCharset; |
| 582 | |
| 583 | /** |
| 584 | * Returns the default charset of this Java virtual machine. |
| 585 | * |
| 586 | * <p> The default charset is determined during virtual-machine startup and |
| 587 | * typically depends upon the locale and charset of the underlying |
| 588 | * operating system. |
| 589 | * |
| 590 | * @return A charset object for the default charset |
| 591 | * |
| 592 | * @since 1.5 |
| 593 | */ |
| 594 | public static Charset defaultCharset() { |
| 595 | if (defaultCharset == null) { |
| 596 | synchronized (Charset.class) { |
| 597 | String csn = AccessController.doPrivileged( |
| 598 | new GetPropertyAction("file.encoding")); |
| 599 | Charset cs = lookup(csn); |
| 600 | if (cs != null) |
| 601 | defaultCharset = cs; |
| 602 | else |
| 603 | defaultCharset = forName("UTF-8"); |
| 604 | } |
| 605 | } |
| 606 | return defaultCharset; |
| 607 | } |
| 608 | |
| 609 | |
| 610 | /* -- Instance fields and methods -- */ |
| 611 | |
private final String name;          // tickles a bug in oldjavac
private final String[] aliases;     // tickles a bug in oldjavac
private Set aliasSet = null;        // lazily built by aliases()

/**
 * Initializes a new charset with the given canonical name and alias
 * set.
 *
 * <p> The alias array is copied, so later changes that the caller makes
 * to the array do not affect this charset.
 *
 * @param  canonicalName
 *         The canonical name of this charset
 *
 * @param  aliases
 *         An array of this charset's aliases, or null if it has no aliases
 *
 * @throws IllegalCharsetNameException
 *         If the canonical name or any of the aliases are illegal
 */
protected Charset(String canonicalName, String[] aliases) {
    checkName(canonicalName);
    // Copy before validating so that the names which are checked are
    // exactly the names which are stored
    String[] as = (aliases == null) ? new String[0] : aliases.clone();
    for (int i = 0; i < as.length; i++)
        checkName(as[i]);
    this.name = canonicalName;
    this.aliases = as;
}
| 637 | |
/**
 * Returns this charset's canonical name.
 *
 * <p> This is the name that was passed to the constructor.
 *
 * @return  The canonical name of this charset
 */
public final String name() {
    return name;
}
| 646 | |
| 647 | /** |
| 648 | * Returns a set containing this charset's aliases. </p> |
| 649 | * |
| 650 | * @return An immutable set of this charset's aliases |
| 651 | */ |
| 652 | public final Set<String> aliases() { |
| 653 | if (aliasSet != null) |
| 654 | return aliasSet; |
| 655 | int n = aliases.length; |
| 656 | HashSet hs = new HashSet(n); |
| 657 | for (int i = 0; i < n; i++) |
| 658 | hs.add(aliases[i]); |
| 659 | aliasSet = Collections.unmodifiableSet(hs); |
| 660 | return aliasSet; |
| 661 | } |
| 662 | |
/**
 * Returns this charset's human-readable name for the default locale.
 *
 * <p> The default implementation of this method simply returns this
 * charset's canonical name.  Concrete subclasses of this class may
 * override this method in order to provide a localized display name. </p>
 *
 * @return  The display name of this charset in the default locale
 */
public String displayName() {
    // No localization here: the canonical name doubles as the display name
    return name;
}
| 675 | |
| 676 | /** |
| 677 | * Tells whether or not this charset is registered in the <a |
| 678 | * href="http://www.iana.org/assignments/character-sets">IANA Charset |
| 679 | * Registry</a>. </p> |
| 680 | * |
| 681 | * @return <tt>true</tt> if, and only if, this charset is known by its |
| 682 | * implementor to be registered with the IANA |
| 683 | */ |
| 684 | public final boolean isRegistered() { |
| 685 | return !name.startsWith("X-") && !name.startsWith("x-"); |
| 686 | } |
| 687 | |
/**
 * Returns this charset's human-readable name for the given locale.
 *
 * <p> The default implementation of this method simply returns this
 * charset's canonical name.  Concrete subclasses of this class may
 * override this method in order to provide a localized display name. </p>
 *
 * @param  locale
 *         The locale for which the display name is to be retrieved;
 *         ignored by the default implementation
 *
 * @return  The display name of this charset in the given locale
 */
public String displayName(Locale locale) {
    // The locale argument is not consulted by this default implementation
    return name;
}
| 703 | |
/**
 * Tells whether or not this charset contains the given charset.
 *
 * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
 * and only if, every character representable in <i>D</i> is also
 * representable in <i>C</i>.  If this relationship holds then it is
 * guaranteed that every string that can be encoded in <i>D</i> can also be
 * encoded in <i>C</i> without performing any replacements.
 *
 * <p> That <i>C</i> contains <i>D</i> does not imply that each character
 * representable in <i>C</i> by a particular byte sequence is represented
 * in <i>D</i> by the same byte sequence, although sometimes this is the
 * case.
 *
 * <p> Every charset contains itself.
 *
 * <p> This method computes an approximation of the containment relation:
 * If it returns <tt>true</tt> then the given charset is known to be
 * contained by this charset; if it returns <tt>false</tt>, however, then
 * it is not necessarily the case that the given charset is not contained
 * in this charset.
 *
 * @param  cs
 *         The charset to be tested for containment in this charset
 *
 * @return  <tt>true</tt> if the given charset is contained in this charset
 */
public abstract boolean contains(Charset cs);
| 729 | |
/**
 * Constructs a new decoder for this charset.
 *
 * @return  A new decoder for this charset
 */
public abstract CharsetDecoder newDecoder();
| 736 | |
/**
 * Constructs a new encoder for this charset.
 *
 * @return  A new encoder for this charset
 *
 * @throws  UnsupportedOperationException
 *          If this charset does not support encoding
 */
public abstract CharsetEncoder newEncoder();
| 746 | |
| 747 | /** |
| 748 | * Tells whether or not this charset supports encoding. |
| 749 | * |
| 750 | * <p> Nearly all charsets support encoding. The primary exceptions are |
| 751 | * special-purpose <i>auto-detect</i> charsets whose decoders can determine |
| 752 | * which of several possible encoding schemes is in use by examining the |
| 753 | * input byte sequence. Such charsets do not support encoding because |
| 754 | * there is no way to determine which encoding should be used on output. |
| 755 | * Implementations of such charsets should override this method to return |
| 756 | * <tt>false</tt>. </p> |
| 757 | * |
| 758 | * @return <tt>true</tt> if, and only if, this charset supports encoding |
| 759 | */ |
| 760 | public boolean canEncode() { |
| 761 | return true; |
| 762 | } |
| 763 | |
| 764 | /** |
| 765 | * Convenience method that decodes bytes in this charset into Unicode |
| 766 | * characters. |
| 767 | * |
| 768 | * <p> An invocation of this method upon a charset <tt>cs</tt> returns the |
| 769 | * same result as the expression |
| 770 | * |
| 771 | * <pre> |
| 772 | * cs.newDecoder() |
| 773 | * .onMalformedInput(CodingErrorAction.REPLACE) |
| 774 | * .onUnmappableCharacter(CodingErrorAction.REPLACE) |
| 775 | * .decode(bb); </pre> |
| 776 | * |
| 777 | * except that it is potentially more efficient because it can cache |
| 778 | * decoders between successive invocations. |
| 779 | * |
| 780 | * <p> This method always replaces malformed-input and unmappable-character |
| 781 | * sequences with this charset's default replacement byte array. In order |
| 782 | * to detect such sequences, use the {@link |
| 783 | * CharsetDecoder#decode(java.nio.ByteBuffer)} method directly. </p> |
| 784 | * |
| 785 | * @param bb The byte buffer to be decoded |
| 786 | * |
| 787 | * @return A char buffer containing the decoded characters |
| 788 | */ |
| 789 | public final CharBuffer decode(ByteBuffer bb) { |
| 790 | try { |
| 791 | return ThreadLocalCoders.decoderFor(this) |
| 792 | .onMalformedInput(CodingErrorAction.REPLACE) |
| 793 | .onUnmappableCharacter(CodingErrorAction.REPLACE) |
| 794 | .decode(bb); |
| 795 | } catch (CharacterCodingException x) { |
| 796 | throw new Error(x); // Can't happen |
| 797 | } |
| 798 | } |
| 799 | |
| 800 | /** |
| 801 | * Convenience method that encodes Unicode characters into bytes in this |
| 802 | * charset. |
| 803 | * |
| 804 | * <p> An invocation of this method upon a charset <tt>cs</tt> returns the |
| 805 | * same result as the expression |
| 806 | * |
| 807 | * <pre> |
| 808 | * cs.newEncoder() |
| 809 | * .onMalformedInput(CodingErrorAction.REPLACE) |
| 810 | * .onUnmappableCharacter(CodingErrorAction.REPLACE) |
| 811 | * .encode(bb); </pre> |
| 812 | * |
| 813 | * except that it is potentially more efficient because it can cache |
| 814 | * encoders between successive invocations. |
| 815 | * |
| 816 | * <p> This method always replaces malformed-input and unmappable-character |
| 817 | * sequences with this charset's default replacement string. In order to |
| 818 | * detect such sequences, use the {@link |
| 819 | * CharsetEncoder#encode(java.nio.CharBuffer)} method directly. </p> |
| 820 | * |
| 821 | * @param cb The char buffer to be encoded |
| 822 | * |
| 823 | * @return A byte buffer containing the encoded characters |
| 824 | */ |
| 825 | public final ByteBuffer encode(CharBuffer cb) { |
| 826 | try { |
| 827 | return ThreadLocalCoders.encoderFor(this) |
| 828 | .onMalformedInput(CodingErrorAction.REPLACE) |
| 829 | .onUnmappableCharacter(CodingErrorAction.REPLACE) |
| 830 | .encode(cb); |
| 831 | } catch (CharacterCodingException x) { |
| 832 | throw new Error(x); // Can't happen |
| 833 | } |
| 834 | } |
| 835 | |
| 836 | /** |
| 837 | * Convenience method that encodes a string into bytes in this charset. |
| 838 | * |
| 839 | * <p> An invocation of this method upon a charset <tt>cs</tt> returns the |
| 840 | * same result as the expression |
| 841 | * |
| 842 | * <pre> |
| 843 | * cs.encode(CharBuffer.wrap(s)); </pre> |
| 844 | * |
| 845 | * @param str The string to be encoded |
| 846 | * |
| 847 | * @return A byte buffer containing the encoded characters |
| 848 | */ |
| 849 | public final ByteBuffer encode(String str) { |
| 850 | return encode(CharBuffer.wrap(str)); |
| 851 | } |
| 852 | |
| 853 | /** |
| 854 | * Compares this charset to another. |
| 855 | * |
| 856 | * <p> Charsets are ordered by their canonical names, without regard to |
| 857 | * case. </p> |
| 858 | * |
| 859 | * @param that |
| 860 | * The charset to which this charset is to be compared |
| 861 | * |
| 862 | * @return A negative integer, zero, or a positive integer as this charset |
| 863 | * is less than, equal to, or greater than the specified charset |
| 864 | */ |
| 865 | public final int compareTo(Charset that) { |
| 866 | return (name().compareToIgnoreCase(that.name())); |
| 867 | } |
| 868 | |
| 869 | /** |
| 870 | * Computes a hashcode for this charset. </p> |
| 871 | * |
| 872 | * @return An integer hashcode |
| 873 | */ |
| 874 | public final int hashCode() { |
| 875 | return name().hashCode(); |
| 876 | } |
| 877 | |
| 878 | /** |
| 879 | * Tells whether or not this object is equal to another. |
| 880 | * |
| 881 | * <p> Two charsets are equal if, and only if, they have the same canonical |
| 882 | * names. A charset is never equal to any other type of object. </p> |
| 883 | * |
| 884 | * @return <tt>true</tt> if, and only if, this charset is equal to the |
| 885 | * given object |
| 886 | */ |
| 887 | public final boolean equals(Object ob) { |
| 888 | if (!(ob instanceof Charset)) |
| 889 | return false; |
| 890 | if (this == ob) |
| 891 | return true; |
| 892 | return name.equals(((Charset)ob).name()); |
| 893 | } |
| 894 | |
| 895 | /** |
| 896 | * Returns a string describing this charset. </p> |
| 897 | * |
| 898 | * @return A string describing this charset |
| 899 | */ |
| 900 | public final String toString() { |
| 901 | return name(); |
| 902 | } |
| 903 | |
| 904 | } |