blob: 1372343271274fda5f35a1c31209060193ef8dff [file] [log] [blame]
J. Duke319a3b92007-12-01 00:00:00 +00001/*
2 * Copyright 2000-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26package java.nio.charset;
27
28import java.nio.ByteBuffer;
29import java.nio.CharBuffer;
30import java.nio.charset.spi.CharsetProvider;
31import java.security.AccessController;
32import java.security.AccessControlException;
33import java.security.PrivilegedAction;
34import java.util.Collections;
35import java.util.HashSet;
36import java.util.Iterator;
37import java.util.Locale;
38import java.util.Map;
39import java.util.NoSuchElementException;
40import java.util.Set;
41import java.util.ServiceLoader;
42import java.util.ServiceConfigurationError;
43import java.util.SortedMap;
44import java.util.TreeMap;
45import sun.misc.ASCIICaseInsensitiveComparator;
46import sun.nio.cs.StandardCharsets;
47import sun.nio.cs.ThreadLocalCoders;
48import sun.security.action.GetPropertyAction;
49
50
51/**
52 * A named mapping between sequences of sixteen-bit Unicode <a
53 * href="../../lang/Character.html#unicode">code units</a> and sequences of
54 * bytes. This class defines methods for creating decoders and encoders and
55 * for retrieving the various names associated with a charset. Instances of
56 * this class are immutable.
57 *
58 * <p> This class also defines static methods for testing whether a particular
59 * charset is supported, for locating charset instances by name, and for
60 * constructing a map that contains every charset for which support is
61 * available in the current Java virtual machine. Support for new charsets can
62 * be added via the service-provider interface defined in the {@link
63 * java.nio.charset.spi.CharsetProvider} class.
64 *
65 * <p> All of the methods defined in this class are safe for use by multiple
66 * concurrent threads.
67 *
68 *
69 * <a name="names"></a><a name="charenc"></a>
70 * <h4>Charset names</h4>
71 *
72 * <p> Charsets are named by strings composed of the following characters:
73 *
74 * <ul>
75 *
76 * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
77 * (<tt>'&#92;u0041'</tt>&nbsp;through&nbsp;<tt>'&#92;u005a'</tt>),
78 *
79 * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
80 * (<tt>'&#92;u0061'</tt>&nbsp;through&nbsp;<tt>'&#92;u007a'</tt>),
81 *
82 * <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
83 * (<tt>'&#92;u0030'</tt>&nbsp;through&nbsp;<tt>'&#92;u0039'</tt>),
84 *
85 * <li> The dash character <tt>'-'</tt>
86 * (<tt>'&#92;u002d'</tt>,&nbsp;<small>HYPHEN-MINUS</small>),
87 *
88 * <li> The period character <tt>'.'</tt>
89 * (<tt>'&#92;u002e'</tt>,&nbsp;<small>FULL STOP</small>),
90 *
91 * <li> The colon character <tt>':'</tt>
92 * (<tt>'&#92;u003a'</tt>,&nbsp;<small>COLON</small>), and
93 *
94 * <li> The underscore character <tt>'_'</tt>
95 * (<tt>'&#92;u005f'</tt>,&nbsp;<small>LOW&nbsp;LINE</small>).
96 *
97 * </ul>
98 *
99 * A charset name must begin with either a letter or a digit. The empty string
100 * is not a legal charset name. Charset names are not case-sensitive; that is,
101 * case is always ignored when comparing charset names. Charset names
102 * generally follow the conventions documented in <a
103 * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC&nbsp;2278:&nbsp;IANA Charset
104 * Registration Procedures</i></a>.
105 *
106 * <p> Every charset has a <i>canonical name</i> and may also have one or more
107 * <i>aliases</i>. The canonical name is returned by the {@link #name() name} method
108 * of this class. Canonical names are, by convention, usually in upper case.
109 * The aliases of a charset are returned by the {@link #aliases() aliases}
110 * method.
111 *
112 * <a name="hn"></a>
113 *
114 * <p> Some charsets have an <i>historical name</i> that is defined for
115 * compatibility with previous versions of the Java platform. A charset's
116 * historical name is either its canonical name or one of its aliases. The
117 * historical name is returned by the <tt>getEncoding()</tt> methods of the
118 * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link
119 * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes.
120 *
121 * <a name="iana"></a>
122 *
123 * <p> If a charset listed in the <a
124 * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset
125 * Registry</i></a> is supported by an implementation of the Java platform then
126 * its canonical name must be the name listed in the registry. Many charsets
127 * are given more than one name in the registry, in which case the registry
128 * identifies one of the names as <i>MIME-preferred</i>. If a charset has more
129 * than one registry name then its canonical name must be the MIME-preferred
130 * name and the other names in the registry must be valid aliases. If a
131 * supported charset is not listed in the IANA registry then its canonical name
132 * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>.
133 *
134 * <p> The IANA charset registry does change over time, and so the canonical
135 * name and the aliases of a particular charset may also change over time. To
136 * ensure compatibility it is recommended that no alias ever be removed from a
137 * charset, and that if the canonical name of a charset is changed then its
138 * previous canonical name be made into an alias.
139 *
140 *
141 * <h4>Standard charsets</h4>
142 *
143 * <p> Every implementation of the Java platform is required to support the
144 * following standard charsets. Consult the release documentation for your
145 * implementation to see if any other charsets are supported. The behavior
146 * of such optional charsets may differ between implementations.
147 *
148 * <blockquote><table width="80%" summary="Description of standard charsets">
149 * <tr><th><p align="left">Charset</p></th><th><p align="left">Description</p></th></tr>
150 * <tr><td valign=top><tt>US-ASCII</tt></td>
151 * <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>,
152 * a.k.a. the Basic Latin block of the Unicode character set</td></tr>
153 * <tr><td valign=top><tt>ISO-8859-1&nbsp;&nbsp;</tt></td>
154 * <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr>
155 * <tr><td valign=top><tt>UTF-8</tt></td>
156 * <td>Eight-bit UCS Transformation Format</td></tr>
157 * <tr><td valign=top><tt>UTF-16BE</tt></td>
158 * <td>Sixteen-bit UCS Transformation Format,
159 * big-endian byte&nbsp;order</td></tr>
160 * <tr><td valign=top><tt>UTF-16LE</tt></td>
161 * <td>Sixteen-bit UCS Transformation Format,
162 * little-endian byte&nbsp;order</td></tr>
163 * <tr><td valign=top><tt>UTF-16</tt></td>
164 * <td>Sixteen-bit UCS Transformation Format,
165 * byte&nbsp;order identified by an optional byte-order mark</td></tr>
166 * </table></blockquote>
167 *
168 * <p> The <tt>UTF-8</tt> charset is specified by <a
169 * href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279</i></a>; the
170 * transformation format upon which it is based is specified in
171 * Amendment&nbsp;2 of ISO&nbsp;10646-1 and is also described in the <a
172 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode
173 * Standard</i></a>.
174 *
175 * <p> The <tt>UTF-16</tt> charsets are specified by <a
176 * href="http://www.ietf.org/rfc/rfc2781.txt"><i>RFC&nbsp;2781</i></a>; the
177 * transformation formats upon which they are based are specified in
178 * Amendment&nbsp;1 of ISO&nbsp;10646-1 and are also described in the <a
179 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode
180 * Standard</i></a>.
181 *
182 * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are
183 * therefore sensitive to byte order. In these encodings the byte order of a
184 * stream may be indicated by an initial <i>byte-order mark</i> represented by
185 * the Unicode character <tt>'&#92;uFEFF'</tt>. Byte-order marks are handled
186 * as follows:
187 *
188 * <ul>
189 *
190 * <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt>
191 * charsets ignore byte-order marks; when encoding, they do not write
192 * byte-order marks. </p></li>
193 *
194 * <li><p> When decoding, the <tt>UTF-16</tt> charset interprets a byte-order
195 * mark to indicate the byte order of the stream but defaults to big-endian
196 * if there is no byte-order mark; when encoding, it uses big-endian byte
197 * order and writes a big-endian byte-order mark. </p></li>
198 *
199 * </ul>
200 *
201 * In any case, when a byte-order mark is read at the beginning of a decoding
202 * operation it is omitted from the resulting sequence of characters. Byte
203 * order marks occurring after the first element of an input sequence are not
204 * omitted since the same code is used to represent <small>ZERO-WIDTH
205 * NON-BREAKING SPACE</small>.
206 *
207 * <p> Every instance of the Java virtual machine has a default charset, which
208 * may or may not be one of the standard charsets. The default charset is
209 * determined during virtual-machine startup and typically depends upon the
210 * locale and charset being used by the underlying operating system. </p>
211 *
212 *
213 * <h4>Terminology</h4>
214 *
215 * <p> The name of this class is taken from the terms used in <a
216 * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC&nbsp;2278</i></a>. In that
217 * document a <i>charset</i> is defined as the combination of a coded character
218 * set and a character-encoding scheme.
219 *
220 * <p> A <i>coded character set</i> is a mapping between a set of abstract
221 * characters and a set of integers. US-ASCII, ISO&nbsp;8859-1,
222 * JIS&nbsp;X&nbsp;0201, and full Unicode, which is the same as
223 * ISO&nbsp;10646-1, are examples of coded character sets.
224 *
225 * <p> A <i>character-encoding scheme</i> is a mapping between a coded
226 * character set and a set of octet (eight-bit byte) sequences. UTF-8, UCS-2,
227 * UTF-16, ISO&nbsp;2022, and EUC are examples of character-encoding schemes.
228 * Encoding schemes are often associated with a particular coded character set;
229 * UTF-8, for example, is used only to encode Unicode. Some schemes, however,
230 * are associated with multiple character sets; EUC, for example, can be used
231 * to encode characters in a variety of Asian character sets.
232 *
233 * <p> When a coded character set is used exclusively with a single
234 * character-encoding scheme then the corresponding charset is usually named
235 * for the character set; otherwise a charset is usually named for the encoding
236 * scheme and, possibly, the locale of the character sets that it supports.
237 * Hence <tt>US-ASCII</tt> is the name of the charset for US-ASCII while
238 * <tt>EUC-JP</tt> is the name of the charset that encodes the
239 * JIS&nbsp;X&nbsp;0201, JIS&nbsp;X&nbsp;0208, and JIS&nbsp;X&nbsp;0212
240 * character sets.
241 *
242 * <p> The native character encoding of the Java programming language is
243 * UTF-16. A charset in the Java platform therefore defines a mapping between
244 * sequences of sixteen-bit UTF-16 code units and sequences of bytes. </p>
245 *
246 *
247 * @author Mark Reinhold
248 * @author JSR-51 Expert Group
249 * @since 1.4
250 *
251 * @see CharsetDecoder
252 * @see CharsetEncoder
253 * @see java.nio.charset.spi.CharsetProvider
254 * @see java.lang.Character
255 */
256
257public abstract class Charset
258 implements Comparable<Charset>
259{
260
261 /* -- Static methods -- */
262
263 private static String bugLevel = null;
264
265 static boolean atBugLevel(String bl) { // package-private
266 if (bugLevel == null) {
267 if (!sun.misc.VM.isBooted())
268 return false;
269 bugLevel = AccessController.doPrivileged(
270 new GetPropertyAction("sun.nio.cs.bugLevel"));
271 if (bugLevel == null)
272 bugLevel = "";
273 }
274 return (bugLevel != null) && bugLevel.equals(bl);
275 }
276
277 /**
278 * Checks that the given string is a legal charset name. </p>
279 *
280 * @param s
281 * A purported charset name
282 *
283 * @throws IllegalCharsetNameException
284 * If the given name is not a legal charset name
285 */
286 private static void checkName(String s) {
287 int n = s.length();
288 if (!atBugLevel("1.4")) {
289 if (n == 0)
290 throw new IllegalCharsetNameException(s);
291 }
292 for (int i = 0; i < n; i++) {
293 char c = s.charAt(i);
294 if (c >= 'A' && c <= 'Z') continue;
295 if (c >= 'a' && c <= 'z') continue;
296 if (c >= '0' && c <= '9') continue;
297 if (c == '-' && i != 0) continue;
298 if (c == ':' && i != 0) continue;
299 if (c == '_' && i != 0) continue;
300 if (c == '.' && i != 0) continue;
301 throw new IllegalCharsetNameException(s);
302 }
303 }
304
305 /* The standard set of charsets */
306 private static CharsetProvider standardProvider = new StandardCharsets();
307
308 // Cache of the most-recently-returned charsets,
309 // along with the names that were used to find them
310 //
311 private static volatile Object[] cache1 = null; // "Level 1" cache
312 private static volatile Object[] cache2 = null; // "Level 2" cache
313
314 private static void cache(String charsetName, Charset cs) {
315 cache2 = cache1;
316 cache1 = new Object[] { charsetName, cs };
317 }
318
319 // Creates an iterator that walks over the available providers, ignoring
320 // those whose lookup or instantiation causes a security exception to be
321 // thrown. Should be invoked with full privileges.
322 //
323 private static Iterator providers() {
324 return new Iterator() {
325
326 ClassLoader cl = ClassLoader.getSystemClassLoader();
327 ServiceLoader<CharsetProvider> sl =
328 ServiceLoader.load(CharsetProvider.class, cl);
329 Iterator<CharsetProvider> i = sl.iterator();
330
331 Object next = null;
332
333 private boolean getNext() {
334 while (next == null) {
335 try {
336 if (!i.hasNext())
337 return false;
338 next = i.next();
339 } catch (ServiceConfigurationError sce) {
340 if (sce.getCause() instanceof SecurityException) {
341 // Ignore security exceptions
342 continue;
343 }
344 throw sce;
345 }
346 }
347 return true;
348 }
349
350 public boolean hasNext() {
351 return getNext();
352 }
353
354 public Object next() {
355 if (!getNext())
356 throw new NoSuchElementException();
357 Object n = next;
358 next = null;
359 return n;
360 }
361
362 public void remove() {
363 throw new UnsupportedOperationException();
364 }
365
366 };
367 }
368
369 // Thread-local gate to prevent recursive provider lookups
370 private static ThreadLocal gate = new ThreadLocal();
371
372 private static Charset lookupViaProviders(final String charsetName) {
373
374 // The runtime startup sequence looks up standard charsets as a
375 // consequence of the VM's invocation of System.initializeSystemClass
376 // in order to, e.g., set system properties and encode filenames. At
377 // that point the application class loader has not been initialized,
378 // however, so we can't look for providers because doing so will cause
379 // that loader to be prematurely initialized with incomplete
380 // information.
381 //
382 if (!sun.misc.VM.isBooted())
383 return null;
384
385 if (gate.get() != null)
386 // Avoid recursive provider lookups
387 return null;
388 try {
389 gate.set(gate);
390
391 return AccessController.doPrivileged(
392 new PrivilegedAction<Charset>() {
393 public Charset run() {
394 for (Iterator i = providers(); i.hasNext();) {
395 CharsetProvider cp = (CharsetProvider)i.next();
396 Charset cs = cp.charsetForName(charsetName);
397 if (cs != null)
398 return cs;
399 }
400 return null;
401 }
402 });
403
404 } finally {
405 gate.set(null);
406 }
407 }
408
409 /* The extended set of charsets */
410 private static Object extendedProviderLock = new Object();
411 private static boolean extendedProviderProbed = false;
412 private static CharsetProvider extendedProvider = null;
413
414 private static void probeExtendedProvider() {
415 AccessController.doPrivileged(new PrivilegedAction<Object>() {
416 public Object run() {
417 try {
418 Class epc
419 = Class.forName("sun.nio.cs.ext.ExtendedCharsets");
420 extendedProvider = (CharsetProvider)epc.newInstance();
421 } catch (ClassNotFoundException x) {
422 // Extended charsets not available
423 // (charsets.jar not present)
424 } catch (InstantiationException x) {
425 throw new Error(x);
426 } catch (IllegalAccessException x) {
427 throw new Error(x);
428 }
429 return null;
430 }
431 });
432 }
433
434 private static Charset lookupExtendedCharset(String charsetName) {
435 CharsetProvider ecp = null;
436 synchronized (extendedProviderLock) {
437 if (!extendedProviderProbed) {
438 probeExtendedProvider();
439 extendedProviderProbed = true;
440 }
441 ecp = extendedProvider;
442 }
443 return (ecp != null) ? ecp.charsetForName(charsetName) : null;
444 }
445
446 private static Charset lookup(String charsetName) {
447 if (charsetName == null)
448 throw new IllegalArgumentException("Null charset name");
449
450 Object[] a;
451 if ((a = cache1) != null && charsetName.equals(a[0]))
452 return (Charset)a[1];
453 // We expect most programs to use one Charset repeatedly.
454 // We convey a hint to this effect to the VM by putting the
455 // level 1 cache miss code in a separate method.
456 return lookup2(charsetName);
457 }
458
459 private static Charset lookup2(String charsetName) {
460 Object[] a;
461 if ((a = cache2) != null && charsetName.equals(a[0])) {
462 cache2 = cache1;
463 cache1 = a;
464 return (Charset)a[1];
465 }
466
467 Charset cs;
468 if ((cs = standardProvider.charsetForName(charsetName)) != null ||
469 (cs = lookupExtendedCharset(charsetName)) != null ||
470 (cs = lookupViaProviders(charsetName)) != null)
471 {
472 cache(charsetName, cs);
473 return cs;
474 }
475
476 /* Only need to check the name if we didn't find a charset for it */
477 checkName(charsetName);
478 return null;
479 }
480
481 /**
482 * Tells whether the named charset is supported. </p>
483 *
484 * @param charsetName
485 * The name of the requested charset; may be either
486 * a canonical name or an alias
487 *
488 * @return <tt>true</tt> if, and only if, support for the named charset
489 * is available in the current Java virtual machine
490 *
491 * @throws IllegalCharsetNameException
492 * If the given charset name is illegal
493 *
494 * @throws IllegalArgumentException
495 * If the given <tt>charsetName</tt> is null
496 */
497 public static boolean isSupported(String charsetName) {
498 return (lookup(charsetName) != null);
499 }
500
501 /**
502 * Returns a charset object for the named charset. </p>
503 *
504 * @param charsetName
505 * The name of the requested charset; may be either
506 * a canonical name or an alias
507 *
508 * @return A charset object for the named charset
509 *
510 * @throws IllegalCharsetNameException
511 * If the given charset name is illegal
512 *
513 * @throws IllegalArgumentException
514 * If the given <tt>charsetName</tt> is null
515 *
516 * @throws UnsupportedCharsetException
517 * If no support for the named charset is available
518 * in this instance of the Java virtual machine
519 */
520 public static Charset forName(String charsetName) {
521 Charset cs = lookup(charsetName);
522 if (cs != null)
523 return cs;
524 throw new UnsupportedCharsetException(charsetName);
525 }
526
527 // Fold charsets from the given iterator into the given map, ignoring
528 // charsets whose names already have entries in the map.
529 //
530 private static void put(Iterator i, Map m) {
531 while (i.hasNext()) {
532 Charset cs = (Charset)i.next();
533 if (!m.containsKey(cs.name()))
534 m.put(cs.name(), cs);
535 }
536 }
537
538 /**
539 * Constructs a sorted map from canonical charset names to charset objects.
540 *
541 * <p> The map returned by this method will have one entry for each charset
542 * for which support is available in the current Java virtual machine. If
543 * two or more supported charsets have the same canonical name then the
544 * resulting map will contain just one of them; which one it will contain
545 * is not specified. </p>
546 *
547 * <p> The invocation of this method, and the subsequent use of the
548 * resulting map, may cause time-consuming disk or network I/O operations
549 * to occur. This method is provided for applications that need to
550 * enumerate all of the available charsets, for example to allow user
551 * charset selection. This method is not used by the {@link #forName
552 * forName} method, which instead employs an efficient incremental lookup
553 * algorithm.
554 *
555 * <p> This method may return different results at different times if new
556 * charset providers are dynamically made available to the current Java
557 * virtual machine. In the absence of such changes, the charsets returned
558 * by this method are exactly those that can be retrieved via the {@link
559 * #forName forName} method. </p>
560 *
561 * @return An immutable, case-insensitive map from canonical charset names
562 * to charset objects
563 */
564 public static SortedMap<String,Charset> availableCharsets() {
565 return AccessController.doPrivileged(
566 new PrivilegedAction<SortedMap<String,Charset>>() {
567 public SortedMap<String,Charset> run() {
568 TreeMap<String,Charset> m =
569 new TreeMap<String,Charset>(
570 ASCIICaseInsensitiveComparator.CASE_INSENSITIVE_ORDER);
571 put(standardProvider.charsets(), m);
572 for (Iterator i = providers(); i.hasNext();) {
573 CharsetProvider cp = (CharsetProvider)i.next();
574 put(cp.charsets(), m);
575 }
576 return Collections.unmodifiableSortedMap(m);
577 }
578 });
579 }
580
581 private static volatile Charset defaultCharset;
582
583 /**
584 * Returns the default charset of this Java virtual machine.
585 *
586 * <p> The default charset is determined during virtual-machine startup and
587 * typically depends upon the locale and charset of the underlying
588 * operating system.
589 *
590 * @return A charset object for the default charset
591 *
592 * @since 1.5
593 */
594 public static Charset defaultCharset() {
595 if (defaultCharset == null) {
596 synchronized (Charset.class) {
597 String csn = AccessController.doPrivileged(
598 new GetPropertyAction("file.encoding"));
599 Charset cs = lookup(csn);
600 if (cs != null)
601 defaultCharset = cs;
602 else
603 defaultCharset = forName("UTF-8");
604 }
605 }
606 return defaultCharset;
607 }
608
609
610 /* -- Instance fields and methods -- */
611
private final String name;          // tickles a bug in oldjavac
private final String[] aliases;     // tickles a bug in oldjavac
private Set<String> aliasSet = null;    // built on demand by aliases()

/**
 * Initializes a new charset with the given canonical name and alias
 * set.
 *
 * <p> The alias array is copied before it is validated, so subsequent
 * changes to the caller's array have no effect on this charset. </p>
 *
 * @param  canonicalName
 *         The canonical name of this charset
 *
 * @param  aliases
 *         An array of this charset's aliases, or null if it has no aliases
 *
 * @throws IllegalCharsetNameException
 *         If the canonical name or any of the aliases are illegal
 */
protected Charset(String canonicalName, String[] aliases) {
    checkName(canonicalName);
    // Copy first, then validate the copy: otherwise the caller could
    // swap in an illegal alias after the checks below have passed
    String[] as = (aliases == null) ? new String[0] : aliases.clone();
    for (int i = 0; i < as.length; i++) {
        checkName(as[i]);
    }
    this.name = canonicalName;
    this.aliases = as;
}
637
638 /**
639 * Returns this charset's canonical name. </p>
640 *
641 * @return The canonical name of this charset
642 */
643 public final String name() {
644 return name;
645 }
646
647 /**
648 * Returns a set containing this charset's aliases. </p>
649 *
650 * @return An immutable set of this charset's aliases
651 */
652 public final Set<String> aliases() {
653 if (aliasSet != null)
654 return aliasSet;
655 int n = aliases.length;
656 HashSet hs = new HashSet(n);
657 for (int i = 0; i < n; i++)
658 hs.add(aliases[i]);
659 aliasSet = Collections.unmodifiableSet(hs);
660 return aliasSet;
661 }
662
663 /**
664 * Returns this charset's human-readable name for the default locale.
665 *
666 * <p> The default implementation of this method simply returns this
667 * charset's canonical name. Concrete subclasses of this class may
668 * override this method in order to provide a localized display name. </p>
669 *
670 * @return The display name of this charset in the default locale
671 */
672 public String displayName() {
673 return name;
674 }
675
676 /**
677 * Tells whether or not this charset is registered in the <a
678 * href="http://www.iana.org/assignments/character-sets">IANA Charset
679 * Registry</a>. </p>
680 *
681 * @return <tt>true</tt> if, and only if, this charset is known by its
682 * implementor to be registered with the IANA
683 */
684 public final boolean isRegistered() {
685 return !name.startsWith("X-") && !name.startsWith("x-");
686 }
687
688 /**
689 * Returns this charset's human-readable name for the given locale.
690 *
691 * <p> The default implementation of this method simply returns this
692 * charset's canonical name. Concrete subclasses of this class may
693 * override this method in order to provide a localized display name. </p>
694 *
695 * @param locale
696 * The locale for which the display name is to be retrieved
697 *
698 * @return The display name of this charset in the given locale
699 */
700 public String displayName(Locale locale) {
701 return name;
702 }
703
704 /**
705 * Tells whether or not this charset contains the given charset.
706 *
707 * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
708 * and only if, every character representable in <i>D</i> is also
709 * representable in <i>C</i>. If this relationship holds then it is
710 * guaranteed that every string that can be encoded in <i>D</i> can also be
711 * encoded in <i>C</i> without performing any replacements.
712 *
713 * <p> That <i>C</i> contains <i>D</i> does not imply that each character
714 * representable in <i>C</i> by a particular byte sequence is represented
715 * in <i>D</i> by the same byte sequence, although sometimes this is the
716 * case.
717 *
718 * <p> Every charset contains itself.
719 *
720 * <p> This method computes an approximation of the containment relation:
721 * If it returns <tt>true</tt> then the given charset is known to be
722 * contained by this charset; if it returns <tt>false</tt>, however, then
723 * it is not necessarily the case that the given charset is not contained
724 * in this charset.
725 *
726 * @return <tt>true</tt> if the given charset is contained in this charset
727 */
728 public abstract boolean contains(Charset cs);
729
/**
 * Constructs a new decoder for this charset.
 *
 * @return  A new decoder for this charset
 */
public abstract CharsetDecoder newDecoder();
736
/**
 * Constructs a new encoder for this charset.
 *
 * @return  A new encoder for this charset
 *
 * @throws  UnsupportedOperationException
 *          If this charset does not support encoding
 */
public abstract CharsetEncoder newEncoder();
746
/**
 * Tells whether or not this charset supports encoding.
 *
 * <p> Nearly all charsets support encoding.  The primary exceptions are
 * special-purpose <i>auto-detect</i> charsets whose decoders can determine
 * which of several possible encoding schemes is in use by examining the
 * input byte sequence.  Such charsets do not support encoding because
 * there is no way to determine which encoding should be used on output.
 * Implementations of such charsets should override this method to return
 * <tt>false</tt>. </p>
 *
 * <p> This default implementation always returns <tt>true</tt>. </p>
 *
 * @return  <tt>true</tt> if, and only if, this charset supports encoding
 */
public boolean canEncode() {
    return true;
}
763
764 /**
765 * Convenience method that decodes bytes in this charset into Unicode
766 * characters.
767 *
768 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
769 * same result as the expression
770 *
771 * <pre>
772 * cs.newDecoder()
773 * .onMalformedInput(CodingErrorAction.REPLACE)
774 * .onUnmappableCharacter(CodingErrorAction.REPLACE)
775 * .decode(bb); </pre>
776 *
777 * except that it is potentially more efficient because it can cache
778 * decoders between successive invocations.
779 *
780 * <p> This method always replaces malformed-input and unmappable-character
781 * sequences with this charset's default replacement byte array. In order
782 * to detect such sequences, use the {@link
783 * CharsetDecoder#decode(java.nio.ByteBuffer)} method directly. </p>
784 *
785 * @param bb The byte buffer to be decoded
786 *
787 * @return A char buffer containing the decoded characters
788 */
789 public final CharBuffer decode(ByteBuffer bb) {
790 try {
791 return ThreadLocalCoders.decoderFor(this)
792 .onMalformedInput(CodingErrorAction.REPLACE)
793 .onUnmappableCharacter(CodingErrorAction.REPLACE)
794 .decode(bb);
795 } catch (CharacterCodingException x) {
796 throw new Error(x); // Can't happen
797 }
798 }
799
800 /**
801 * Convenience method that encodes Unicode characters into bytes in this
802 * charset.
803 *
804 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
805 * same result as the expression
806 *
807 * <pre>
808 * cs.newEncoder()
809 * .onMalformedInput(CodingErrorAction.REPLACE)
810 * .onUnmappableCharacter(CodingErrorAction.REPLACE)
811 * .encode(bb); </pre>
812 *
813 * except that it is potentially more efficient because it can cache
814 * encoders between successive invocations.
815 *
816 * <p> This method always replaces malformed-input and unmappable-character
817 * sequences with this charset's default replacement string. In order to
818 * detect such sequences, use the {@link
819 * CharsetEncoder#encode(java.nio.CharBuffer)} method directly. </p>
820 *
821 * @param cb The char buffer to be encoded
822 *
823 * @return A byte buffer containing the encoded characters
824 */
825 public final ByteBuffer encode(CharBuffer cb) {
826 try {
827 return ThreadLocalCoders.encoderFor(this)
828 .onMalformedInput(CodingErrorAction.REPLACE)
829 .onUnmappableCharacter(CodingErrorAction.REPLACE)
830 .encode(cb);
831 } catch (CharacterCodingException x) {
832 throw new Error(x); // Can't happen
833 }
834 }
835
836 /**
837 * Convenience method that encodes a string into bytes in this charset.
838 *
839 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
840 * same result as the expression
841 *
842 * <pre>
843 * cs.encode(CharBuffer.wrap(s)); </pre>
844 *
845 * @param str The string to be encoded
846 *
847 * @return A byte buffer containing the encoded characters
848 */
849 public final ByteBuffer encode(String str) {
850 return encode(CharBuffer.wrap(str));
851 }
852
853 /**
854 * Compares this charset to another.
855 *
856 * <p> Charsets are ordered by their canonical names, without regard to
857 * case. </p>
858 *
859 * @param that
860 * The charset to which this charset is to be compared
861 *
862 * @return A negative integer, zero, or a positive integer as this charset
863 * is less than, equal to, or greater than the specified charset
864 */
865 public final int compareTo(Charset that) {
866 return (name().compareToIgnoreCase(that.name()));
867 }
868
869 /**
870 * Computes a hashcode for this charset. </p>
871 *
872 * @return An integer hashcode
873 */
874 public final int hashCode() {
875 return name().hashCode();
876 }
877
878 /**
879 * Tells whether or not this object is equal to another.
880 *
881 * <p> Two charsets are equal if, and only if, they have the same canonical
882 * names. A charset is never equal to any other type of object. </p>
883 *
884 * @return <tt>true</tt> if, and only if, this charset is equal to the
885 * given object
886 */
887 public final boolean equals(Object ob) {
888 if (!(ob instanceof Charset))
889 return false;
890 if (this == ob)
891 return true;
892 return name.equals(((Charset)ob).name());
893 }
894
895 /**
896 * Returns a string describing this charset. </p>
897 *
898 * @return A string describing this charset
899 */
900 public final String toString() {
901 return name();
902 }
903
904}