blob: e54cffa5bb315bfd70ceff5722b898ed297dc3f3 [file] [log] [blame]
/*
 * Copyright 2000-2006 Sun Microsystems, Inc. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Sun designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Sun in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 */
25
26package sun.nio.cs;
27
28import java.nio.ByteBuffer;
29import java.nio.CharBuffer;
30import java.nio.BufferOverflowException;
31import java.nio.BufferUnderflowException;
32import java.nio.charset.Charset;
33import java.nio.charset.CharsetDecoder;
34import java.nio.charset.CharsetEncoder;
35import java.nio.charset.CoderResult;
36import java.nio.charset.CharacterCodingException;
37import java.nio.charset.MalformedInputException;
38import java.nio.charset.UnmappableCharacterException;
39
40
/*
 * #   Bits   Bit pattern
 * 1      7   0xxxxxxx
 * 2     11   110xxxxx 10xxxxxx
 * 3     16   1110xxxx 10xxxxxx 10xxxxxx
 * 4     21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5     26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6     31   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * UCS-2 uses 1-3, UTF-16 uses 1-4, UCS-4 uses 1-6
 */
52
53class UTF_8 extends Unicode
54{
55
/**
 * Creates the charset object, passing the canonical name {@code "UTF-8"}
 * and the alias table {@code StandardCharsets.aliases_UTF_8} to the
 * superclass constructor.
 */
public UTF_8() {
    super("UTF-8", StandardCharsets.aliases_UTF_8);
}
59
60 public String historicalName() {
61 return "UTF8";
62 }
63
64 public CharsetDecoder newDecoder() {
65 return new Decoder(this);
66 }
67
68 public CharsetEncoder newEncoder() {
69 return new Encoder(this);
70 }
71
72
73 private static class Decoder extends CharsetDecoder {
74 private Decoder(Charset cs) {
75 super(cs, 1.0f, 1.0f);
76 }
77
78 private boolean isContinuation(int b) {
79 return ((b & 0xc0) == 0x80);
80 }
81
82 private final Surrogate.Generator sgg = new Surrogate.Generator();
83
84 private CoderResult decodeArrayLoop(ByteBuffer src,
85 CharBuffer dst)
86 {
87 byte[] sa = src.array();
88 int sp = src.arrayOffset() + src.position();
89 int sl = src.arrayOffset() + src.limit();
90 assert (sp <= sl);
91 sp = (sp <= sl ? sp : sl);
92 char[] da = dst.array();
93 int dp = dst.arrayOffset() + dst.position();
94 int dl = dst.arrayOffset() + dst.limit();
95 assert (dp <= dl);
96 dp = (dp <= dl ? dp : dl);
97
98 try {
99 while (sp < sl) {
100 int b1 = sa[sp];
101 int b2, b3;
102 switch ((b1 >> 4) & 0x0f) {
103
104 case 0: case 1: case 2: case 3:
105 case 4: case 5: case 6: case 7:
106 // 1 byte, 7 bits: 0xxxxxxx
107 if (dl - dp < 1)
108 return CoderResult.OVERFLOW;
109 da[dp++] = (char)(b1 & 0x7f);
110 sp++;
111 continue;
112
113 case 12: case 13:
114 // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
115 if (sl - sp < 2)
116 return CoderResult.UNDERFLOW;
117 if (dl - dp < 1)
118 return CoderResult.OVERFLOW;
119 if (!isContinuation(b2 = sa[sp + 1]))
120 return CoderResult.malformedForLength(1);
121 da[dp++] = ((char)(((b1 & 0x1f) << 6) |
122 ((b2 & 0x3f) << 0)));
123 sp += 2;
124 continue;
125
126 case 14:
127 // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
128 if (sl - sp < 3)
129 return CoderResult.UNDERFLOW;
130 if (dl - dp < 1)
131 return CoderResult.OVERFLOW;
132 if (!isContinuation(b2 = sa[sp + 1]))
133 return CoderResult.malformedForLength(1);
134 if (!isContinuation(b3 = sa[sp + 2]))
135 return CoderResult.malformedForLength(2);
136 da[dp++] = ((char)(((b1 & 0x0f) << 12) |
137 ((b2 & 0x3f) << 06) |
138 ((b3 & 0x3f) << 0)));
139 sp += 3;
140 continue;
141
142 case 15:
143 // 4, 5, or 6 bytes
144
145 int b4, b5, b6, uc, n;
146 switch (b1 & 0x0f) {
147
148 case 0: case 1: case 2: case 3:
149 case 4: case 5: case 6: case 7:
150 // 4 bytes, 21 bits
151 if (sl - sp < 4)
152 return CoderResult.UNDERFLOW;
153 if (!isContinuation(b2 = sa[sp + 1]))
154 return CoderResult.malformedForLength(1);
155 if (!isContinuation(b3 = sa[sp + 2]))
156 return CoderResult.malformedForLength(2);
157 if (!isContinuation(b4 = sa[sp + 3]))
158 return CoderResult.malformedForLength(3);
159 uc = (((b1 & 0x07) << 18) |
160 ((b2 & 0x3f) << 12) |
161 ((b3 & 0x3f) << 06) |
162 ((b4 & 0x3f) << 00));
163 n = 4;
164 break;
165
166 case 8: case 9: case 10: case 11:
167 // 5 bytes, 26 bits
168 if (sl - sp < 5)
169 return CoderResult.UNDERFLOW;
170 if (!isContinuation(b2 = sa[sp + 1]))
171 return CoderResult.malformedForLength(1);
172 if (!isContinuation(b3 = sa[sp + 2]))
173 return CoderResult.malformedForLength(2);
174 if (!isContinuation(b4 = sa[sp + 3]))
175 return CoderResult.malformedForLength(3);
176 if (!isContinuation(b5 = sa[sp + 4]))
177 return CoderResult.malformedForLength(4);
178 uc = (((b1 & 0x03) << 24) |
179 ((b2 & 0x3f) << 18) |
180 ((b3 & 0x3f) << 12) |
181 ((b4 & 0x3f) << 06) |
182 ((b5 & 0x3f) << 00));
183 n = 5;
184 break;
185
186 case 12: case 13:
187 // 6 bytes, 31 bits
188 if (sl - sp < 6)
189 return CoderResult.UNDERFLOW;
190 if (!isContinuation(b2 = sa[sp + 1]))
191 return CoderResult.malformedForLength(1);
192 if (!isContinuation(b3 = sa[sp + 2]))
193 return CoderResult.malformedForLength(2);
194 if (!isContinuation(b4 = sa[sp + 3]))
195 return CoderResult.malformedForLength(3);
196 if (!isContinuation(b5 = sa[sp + 4]))
197 return CoderResult.malformedForLength(4);
198 if (!isContinuation(b6 = sa[sp + 5]))
199 return CoderResult.malformedForLength(5);
200 uc = (((b1 & 0x01) << 30) |
201 ((b2 & 0x3f) << 24) |
202 ((b3 & 0x3f) << 18) |
203 ((b4 & 0x3f) << 12) |
204 ((b5 & 0x3f) << 06) |
205 ((b6 & 0x3f)));
206 n = 6;
207 break;
208
209 default:
210 return CoderResult.malformedForLength(1);
211
212 }
213
214 int gn = sgg.generate(uc, n, da, dp, dl);
215 if (gn < 0)
216 return sgg.error();
217 dp += gn;
218 sp += n;
219 continue;
220
221 default:
222 return CoderResult.malformedForLength(1);
223
224 }
225
226 }
227
228 return CoderResult.UNDERFLOW;
229 } finally {
230 src.position(sp - src.arrayOffset());
231 dst.position(dp - dst.arrayOffset());
232 }
233 }
234
235 private CoderResult decodeBufferLoop(ByteBuffer src,
236 CharBuffer dst)
237 {
238 int mark = src.position();
239 try {
240 while (src.hasRemaining()) {
241 int b1 = src.get();
242 int b2, b3;
243 switch ((b1 >> 4) & 0x0f) {
244
245 case 0: case 1: case 2: case 3:
246 case 4: case 5: case 6: case 7:
247 // 1 byte, 7 bits: 0xxxxxxx
248 if (dst.remaining() < 1)
249 return CoderResult.OVERFLOW;
250 dst.put((char)b1);
251 mark++;
252 continue;
253
254 case 12: case 13:
255 // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
256 if (src.remaining() < 1)
257 return CoderResult.UNDERFLOW;
258 if (dst.remaining() < 1)
259 return CoderResult.OVERFLOW;
260 if (!isContinuation(b2 = src.get()))
261 return CoderResult.malformedForLength(1);
262 dst.put((char)(((b1 & 0x1f) << 6) |
263 ((b2 & 0x3f) << 0)));
264 mark += 2;
265 continue;
266
267 case 14:
268 // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
269 if (src.remaining() < 2)
270 return CoderResult.UNDERFLOW;
271 if (dst.remaining() < 1)
272 return CoderResult.OVERFLOW;
273 if (!isContinuation(b2 = src.get()))
274 return CoderResult.malformedForLength(1);
275 if (!isContinuation(b3 = src.get()))
276 return CoderResult.malformedForLength(2);
277 dst.put((char)(((b1 & 0x0f) << 12) |
278 ((b2 & 0x3f) << 06) |
279 ((b3 & 0x3f) << 0)));
280 mark += 3;
281 continue;
282
283 case 15:
284 // 4, 5, or 6 bytes
285
286 int b4, b5, b6, uc, n;
287 switch (b1 & 0x0f) {
288
289 case 0: case 1: case 2: case 3:
290 case 4: case 5: case 6: case 7:
291 // 4 bytes, 21 bits
292 if (src.remaining() < 3)
293 return CoderResult.UNDERFLOW;
294 if (!isContinuation(b2 = src.get()))
295 return CoderResult.malformedForLength(1);
296 if (!isContinuation(b3 = src.get()))
297 return CoderResult.malformedForLength(2);
298 if (!isContinuation(b4 = src.get()))
299 return CoderResult.malformedForLength(3);
300 uc = (((b1 & 0x07) << 18) |
301 ((b2 & 0x3f) << 12) |
302 ((b3 & 0x3f) << 06) |
303 ((b4 & 0x3f) << 00));
304 n = 4;
305 break;
306
307 case 8: case 9: case 10: case 11:
308 // 5 bytes, 26 bits
309 if (src.remaining() < 4)
310 return CoderResult.UNDERFLOW;
311 if (!isContinuation(b2 = src.get()))
312 return CoderResult.malformedForLength(1);
313 if (!isContinuation(b3 = src.get()))
314 return CoderResult.malformedForLength(2);
315 if (!isContinuation(b4 = src.get()))
316 return CoderResult.malformedForLength(3);
317 if (!isContinuation(b5 = src.get()))
318 return CoderResult.malformedForLength(4);
319 uc = (((b1 & 0x03) << 24) |
320 ((b2 & 0x3f) << 18) |
321 ((b3 & 0x3f) << 12) |
322 ((b4 & 0x3f) << 06) |
323 ((b5 & 0x3f) << 00));
324 n = 5;
325 break;
326
327 case 12: case 13:
328 // 6 bytes, 31 bits
329 if (src.remaining() < 4)
330 return CoderResult.UNDERFLOW;
331 if (!isContinuation(b2 = src.get()))
332 return CoderResult.malformedForLength(1);
333 if (!isContinuation(b3 = src.get()))
334 return CoderResult.malformedForLength(2);
335 if (!isContinuation(b4 = src.get()))
336 return CoderResult.malformedForLength(3);
337 if (!isContinuation(b5 = src.get()))
338 return CoderResult.malformedForLength(4);
339 if (!isContinuation(b6 = src.get()))
340 return CoderResult.malformedForLength(5);
341 uc = (((b1 & 0x01) << 30) |
342 ((b2 & 0x3f) << 24) |
343 ((b3 & 0x3f) << 18) |
344 ((b4 & 0x3f) << 12) |
345 ((b5 & 0x3f) << 06) |
346 ((b6 & 0x3f)));
347 n = 6;
348 break;
349
350 default:
351 return CoderResult.malformedForLength(1);
352
353 }
354
355 if (sgg.generate(uc, n, dst) < 0)
356 return sgg.error();
357 mark += n;
358 continue;
359
360 default:
361 return CoderResult.malformedForLength(1);
362
363 }
364
365 }
366 return CoderResult.UNDERFLOW;
367 } finally {
368 src.position(mark);
369 }
370 }
371
372 protected CoderResult decodeLoop(ByteBuffer src,
373 CharBuffer dst)
374 {
375 if (src.hasArray() && dst.hasArray())
376 return decodeArrayLoop(src, dst);
377 else
378 return decodeBufferLoop(src, dst);
379 }
380
381 }
382
383
384 private static class Encoder extends CharsetEncoder {
385
386 private Encoder(Charset cs) {
387 super(cs, 1.1f, 4.0f);
388 }
389
390 public boolean canEncode(char c) {
391 return !Surrogate.is(c);
392 }
393
394 private final Surrogate.Parser sgp = new Surrogate.Parser();
395
396 private CoderResult encodeArrayLoop(CharBuffer src,
397 ByteBuffer dst)
398 {
399 char[] sa = src.array();
400 int sp = src.arrayOffset() + src.position();
401 int sl = src.arrayOffset() + src.limit();
402 assert (sp <= sl);
403 sp = (sp <= sl ? sp : sl);
404 byte[] da = dst.array();
405 int dp = dst.arrayOffset() + dst.position();
406 int dl = dst.arrayOffset() + dst.limit();
407 assert (dp <= dl);
408 dp = (dp <= dl ? dp : dl);
409
410 try {
411 while (sp < sl) {
412 char c = sa[sp];
413
414 if (c < 0x80) {
415 // Have at most seven bits
416 if (dp >= dl)
417 return CoderResult.OVERFLOW;
418 da[dp++] = (byte)c;
419 sp++;
420 continue;
421 }
422
423 if (!Surrogate.is(c)) {
424 // 2 bytes, 11 bits
425 if (c < 0x800) {
426 if (dl - dp < 2)
427 return CoderResult.OVERFLOW;
428 da[dp++] = (byte)(0xc0 | ((c >> 06)));
429 da[dp++] = (byte)(0x80 | ((c >> 00) & 0x3f));
430 sp++;
431 continue;
432 }
433 if (c <= '\uFFFF') {
434 // 3 bytes, 16 bits
435 if (dl - dp < 3)
436 return CoderResult.OVERFLOW;
437 da[dp++] = (byte)(0xe0 | ((c >> 12)));
438 da[dp++] = (byte)(0x80 | ((c >> 06) & 0x3f));
439 da[dp++] = (byte)(0x80 | ((c >> 00) & 0x3f));
440 sp++;
441 continue;
442 }
443 }
444
445 // Have a surrogate pair
446 int uc = sgp.parse(c, sa, sp, sl);
447 if (uc < 0)
448 return sgp.error();
449 if (uc < 0x200000) {
450 if (dl - dp < 4)
451 return CoderResult.OVERFLOW;
452 da[dp++] = (byte)(0xf0 | ((uc >> 18)));
453 da[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
454 da[dp++] = (byte)(0x80 | ((uc >> 06) & 0x3f));
455 da[dp++] = (byte)(0x80 | ((uc >> 00) & 0x3f));
456 sp += sgp.increment();
457 continue;
458 }
459 assert false;
460
461 }
462 return CoderResult.UNDERFLOW;
463 } finally {
464 src.position(sp - src.arrayOffset());
465 dst.position(dp - dst.arrayOffset());
466 }
467 }
468
469 private CoderResult encodeBufferLoop(CharBuffer src,
470 ByteBuffer dst)
471 {
472 int mark = src.position();
473 try {
474 while (src.hasRemaining()) {
475 char c = src.get();
476
477 if (c < 0x80) {
478 // Have at most seven bits
479 if (!dst.hasRemaining())
480 return CoderResult.OVERFLOW;
481 dst.put((byte)c);
482 mark++;
483 continue;
484 }
485
486 if (!Surrogate.is(c)) {
487 if (c < 0x800) {
488 // 2 bytes, 11 bits
489 if (dst.remaining() < 2)
490 return CoderResult.OVERFLOW;
491 dst.put((byte)(0xc0 | ((c >> 06))));
492 dst.put((byte)(0x80 | ((c >> 00) & 0x3f)));
493 mark++;
494 continue;
495 }
496 if (c <= '\uFFFF') {
497 // 3 bytes, 16 bits
498 if (dst.remaining() < 3)
499 return CoderResult.OVERFLOW;
500 dst.put((byte)(0xe0 | ((c >> 12))));
501 dst.put((byte)(0x80 | ((c >> 06) & 0x3f)));
502 dst.put((byte)(0x80 | ((c >> 00) & 0x3f)));
503 mark++;
504 continue;
505 }
506 }
507
508 // Have a surrogate pair
509 int uc = sgp.parse(c, src);
510 if (uc < 0)
511 return sgp.error();
512 if (uc < 0x200000) {
513 if (dst.remaining() < 4)
514 return CoderResult.OVERFLOW;
515 dst.put((byte)(0xf0 | ((uc >> 18))));
516 dst.put((byte)(0x80 | ((uc >> 12) & 0x3f)));
517 dst.put((byte)(0x80 | ((uc >> 06) & 0x3f)));
518 dst.put((byte)(0x80 | ((uc >> 00) & 0x3f)));
519 mark += sgp.increment();
520 continue;
521 }
522 assert false;
523
524 }
525 return CoderResult.UNDERFLOW;
526 } finally {
527 src.position(mark);
528 }
529 }
530
531 protected final CoderResult encodeLoop(CharBuffer src,
532 ByteBuffer dst)
533 {
534 if (src.hasArray() && dst.hasArray())
535 return encodeArrayLoop(src, dst);
536 else
537 return encodeBufferLoop(src, dst);
538 }
539
540 }
541
542}