blob: 369af3a96eb1848916fb1ea992f42895de76d804 [file] [log] [blame]
/*
 * Copyright 2000-2001 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
25
26package sun.nio.cs;
27
28import java.nio.CharBuffer;
29import java.nio.charset.CoderResult;
30import java.nio.charset.MalformedInputException;
31import java.nio.charset.UnmappableCharacterException;
32
33
/**
 * Utility class for dealing with UTF-16 surrogates.
 *
 * <p>Offers static predicates and conversions between UCS-4 code points and
 * UTF-16 surrogate pairs, plus two small stateful helpers: {@link Parser}
 * (UTF-16 to UCS-4) and {@link Generator} (UCS-4 to UTF-16). Instances of
 * the helpers hold the outcome of their most recent operation in plain
 * fields, so a single instance keeps state between calls.
 *
 * @author Mark Reinhold
 */

public class Surrogate {

    // Not instantiable: static members and nested helper classes only.
    private Surrogate() { }

    // UTF-16 surrogate-character ranges
    //
    public static final char MIN_HIGH = '\uD800';
    public static final char MAX_HIGH = '\uDBFF';
    public static final char MIN_LOW = '\uDC00';
    public static final char MAX_LOW = '\uDFFF';
    public static final char MIN = MIN_HIGH;
    public static final char MAX = MAX_LOW;

    // Range of UCS-4 values that need surrogates in UTF-16
    // (0x10000 through 0x10FFFF inclusive)
    //
    public static final int UCS4_MIN = 0x10000;
    public static final int UCS4_MAX = (1 << 20) + UCS4_MIN - 1;

    /**
     * Tells whether or not the given UTF-16 value is a high surrogate.
     *
     * @param c The UTF-16 value to test
     * @return {@code true} if c lies in [MIN_HIGH, MAX_HIGH]
     */
    public static boolean isHigh(int c) {
        return (MIN_HIGH <= c) && (c <= MAX_HIGH);
    }

    /**
     * Tells whether or not the given UTF-16 value is a low surrogate.
     *
     * @param c The UTF-16 value to test
     * @return {@code true} if c lies in [MIN_LOW, MAX_LOW]
     */
    public static boolean isLow(int c) {
        return (MIN_LOW <= c) && (c <= MAX_LOW);
    }

    /**
     * Tells whether or not the given UTF-16 value is a surrogate character,
     * either high or low.
     *
     * @param c The UTF-16 value to test
     * @return {@code true} if c lies in [MIN, MAX]
     */
    public static boolean is(int c) {
        return (MIN <= c) && (c <= MAX);
    }

    /**
     * Tells whether or not the given UCS-4 character must be represented as a
     * surrogate pair in UTF-16.
     *
     * @param uc The UCS-4 character
     * @return {@code true} if uc lies in [UCS4_MIN, UCS4_MAX]
     */
    public static boolean neededFor(int uc) {
        return (uc >= UCS4_MIN) && (uc <= UCS4_MAX);
    }

    /**
     * Returns the high UTF-16 surrogate for the given UCS-4 character.
     *
     * @param uc The UCS-4 character; must satisfy {@link #neededFor(int)}
     *           (checked only by an assertion)
     * @return The high surrogate carrying the upper ten bits of the offset
     *         of uc from UCS4_MIN
     */
    public static char high(int uc) {
        assert neededFor(uc);
        return (char)(0xd800 | (((uc - UCS4_MIN) >> 10) & 0x3ff));
    }

    /**
     * Returns the low UTF-16 surrogate for the given UCS-4 character.
     *
     * @param uc The UCS-4 character; must satisfy {@link #neededFor(int)}
     *           (checked only by an assertion)
     * @return The low surrogate carrying the lower ten bits of the offset
     *         of uc from UCS4_MIN
     */
    public static char low(int uc) {
        assert neededFor(uc);
        return (char)(0xdc00 | ((uc - UCS4_MIN) & 0x3ff));
    }

    /**
     * Converts the given surrogate pair into a 32-bit UCS-4 character.
     *
     * @param c The high surrogate (checked only by an assertion)
     * @param d The low surrogate (checked only by an assertion)
     * @return The UCS-4 character encoded by the pair
     */
    public static int toUCS4(char c, char d) {
        assert isHigh(c) && isLow(d);
        return (((c & 0x3ff) << 10) | (d & 0x3ff)) + 0x10000;
    }

    /**
     * Surrogate parsing support. Charset implementations may use instances of
     * this class to handle the details of parsing UTF-16 surrogate pairs.
     *
     * <p>After a successful parse, {@link #character()}, {@link #isPair()},
     * {@link #increment()} and {@link #unmappableResult()} describe the
     * parsed character; after a failed parse, {@link #error()} describes the
     * failure. Calling the wrong group of accessors trips an assertion.
     */
    public static class Parser {

        public Parser() { }

        private int character;          // UCS-4
        // null after a successful parse, otherwise the failure description
        private CoderResult error = CoderResult.UNDERFLOW;
        private boolean isPair;

        /**
         * Returns the UCS-4 character previously parsed.
         */
        public int character() {
            assert (error == null);
            return character;
        }

        /**
         * Tells whether or not the previously-parsed UCS-4 character was
         * originally represented by a surrogate pair.
         */
        public boolean isPair() {
            assert (error == null);
            return isPair;
        }

        /**
         * Returns the number of UTF-16 characters consumed by the previous
         * parse.
         */
        public int increment() {
            assert (error == null);
            return isPair ? 2 : 1;
        }

        /**
         * If the previous parse operation detected an error, return the object
         * describing that error.
         */
        public CoderResult error() {
            assert (error != null);
            return error;
        }

        /**
         * Returns an unmappable-input result object, with the appropriate
         * input length, for the previously-parsed character.
         */
        public CoderResult unmappableResult() {
            assert (error == null);
            return CoderResult.unmappableForLength(isPair ? 2 : 1);
        }

        // Records a failed parse and yields the -1 failure marker.
        private int fail(CoderResult cr) {
            error = cr;
            return -1;
        }

        // Records a successful parse and yields the parsed character.
        private int accept(int uc, boolean pair) {
            character = uc;
            isPair = pair;
            error = null;
            return character;
        }

        /**
         * Parses a UCS-4 character from the given source buffer, handling
         * surrogates.
         *
         * <p>When c is a high surrogate and the buffer has a character
         * remaining, that character is read with a relative get, so the
         * buffer's position advances by one even if the character read
         * turns out not to be a low surrogate.
         *
         * @param c   The first character
         * @param in  The source buffer, from which one more character
         *            will be consumed if c is a high surrogate
         *
         * @return Either a parsed UCS-4 character, in which case the isPair()
         *         and increment() methods will return meaningful values, or
         *         -1, in which case error() will return a descriptive result
         *         object
         */
        public int parse(char c, CharBuffer in) {
            if (Surrogate.isHigh(c)) {
                if (!in.hasRemaining()) {
                    // The second half of the pair has not arrived yet.
                    return fail(CoderResult.UNDERFLOW);
                }
                char d = in.get();
                if (Surrogate.isLow(d)) {
                    return accept(toUCS4(c, d), true);
                }
                // High surrogate not followed by a low surrogate.
                return fail(CoderResult.malformedForLength(1));
            }
            if (Surrogate.isLow(c)) {
                // Low surrogate with no preceding high surrogate.
                return fail(CoderResult.malformedForLength(1));
            }
            return accept(c, false);
        }

        /**
         * Parses a UCS-4 character from the given source array, handling
         * surrogates.
         *
         * @param c   The first character; must equal ia[ip] (checked only
         *            by an assertion)
         * @param ia  The input array, from which one more character
         *            will be read if c is a high surrogate
         * @param ip  The input index
         * @param il  The input limit
         *
         * @return Either a parsed UCS-4 character, in which case the isPair()
         *         and increment() methods will return meaningful values, or
         *         -1, in which case error() will return a descriptive result
         *         object
         */
        public int parse(char c, char[] ia, int ip, int il) {
            assert (ia[ip] == c);
            if (Surrogate.isHigh(c)) {
                if (il - ip < 2) {
                    // The second half of the pair lies beyond the limit.
                    return fail(CoderResult.UNDERFLOW);
                }
                char d = ia[ip + 1];
                if (Surrogate.isLow(d)) {
                    return accept(toUCS4(c, d), true);
                }
                // High surrogate not followed by a low surrogate.
                return fail(CoderResult.malformedForLength(1));
            }
            if (Surrogate.isLow(c)) {
                // Low surrogate with no preceding high surrogate.
                return fail(CoderResult.malformedForLength(1));
            }
            return accept(c, false);
        }

    }

    /**
     * Surrogate generation support. Charset implementations may use instances
     * of this class to handle the details of generating UTF-16 surrogate
     * pairs.
     */
    public static class Generator {

        public Generator() { }

        // null after a successful generation, otherwise the failure description
        private CoderResult error = CoderResult.OVERFLOW;

        /**
         * If the previous generation operation detected an error, return the
         * object describing that error.
         */
        public CoderResult error() {
            assert error != null;
            return error;
        }

        /**
         * Classifies uc and checks the available room, without writing
         * anything. On failure the error field is set and -1 is returned;
         * on success the error field is left for the caller to clear once
         * the characters have actually been written.
         *
         * @param uc    The UCS-4 character
         * @param len   The input length to report in malformed/unmappable
         *              results
         * @param room  The number of UTF-16 characters the destination can
         *              still accept
         * @return 1 or 2 (the number of UTF-16 characters to write), or -1
         */
        private int charsNeeded(int uc, int len, int room) {
            // Negative values are rejected here as well: a plain
            // "uc <= 0xffff" test would let them through and the (char)
            // cast would silently truncate them to an unrelated character.
            if (uc < 0 || uc > Surrogate.UCS4_MAX) {
                error = CoderResult.unmappableForLength(len);
                return -1;
            }
            // A lone surrogate value is not a character.
            if (Surrogate.is(uc)) {
                error = CoderResult.malformedForLength(len);
                return -1;
            }
            int count = (uc < Surrogate.UCS4_MIN) ? 1 : 2;
            if (room < count) {
                error = CoderResult.OVERFLOW;
                return -1;
            }
            return count;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p>Nothing is written to the destination when -1 is returned. The
         * error is malformed-input if uc is a surrogate value,
         * unmappable-character if uc is negative or greater than UCS4_MAX,
         * and OVERFLOW if the destination has too little room.
         *
         * @param uc   The UCS-4 character
         * @param len  The number of input bytes from which the UCS-4 value
         *             was constructed (used when creating result objects)
         * @param dst  The destination buffer, to which one or two UTF-16
         *             characters will be written
         *
         * @return Either a positive count of the number of UTF-16 characters
         *         written to the destination buffer, or -1, in which case
         *         error() will return a descriptive result object
         */
        public int generate(int uc, int len, CharBuffer dst) {
            int count = charsNeeded(uc, len, dst.remaining());
            if (count < 0) {
                return -1;
            }
            if (count == 1) {
                dst.put((char)uc);
            } else {
                dst.put(Surrogate.high(uc));
                dst.put(Surrogate.low(uc));
            }
            error = null;
            return count;
        }

        /**
         * Generates one or two UTF-16 characters to represent the given UCS-4
         * character.
         *
         * <p>Nothing is written to the destination when -1 is returned. The
         * error is malformed-input if uc is a surrogate value,
         * unmappable-character if uc is negative or greater than UCS4_MAX,
         * and OVERFLOW if fewer than the required number of slots lie
         * between dp and dl.
         *
         * @param uc   The UCS-4 character
         * @param len  The number of input bytes from which the UCS-4 value
         *             was constructed (used when creating result objects)
         * @param da   The destination array, to which one or two UTF-16
         *             characters will be written
         * @param dp   The destination position
         * @param dl   The destination limit
         *
         * @return Either a positive count of the number of UTF-16 characters
         *         written to the destination array, or -1, in which case
         *         error() will return a descriptive result object
         */
        public int generate(int uc, int len, char[] da, int dp, int dl) {
            int count = charsNeeded(uc, len, dl - dp);
            if (count < 0) {
                return -1;
            }
            if (count == 1) {
                da[dp] = (char)uc;
            } else {
                da[dp] = Surrogate.high(uc);
                da[dp + 1] = Surrogate.low(uc);
            }
            error = null;
            return count;
        }

    }

}