blob: 5ff94940ca30f0a04abab7d745f260669d591935 [file] [log] [blame]
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +00001/*
2 * _codecs_jp.c: Codecs collection for Japanese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 * $CJKCodecs: _codecs_jp.c,v 1.14 2004/07/07 17:54:47 perky Exp $
6 */
7
8#define USING_BINARY_PAIR_SEARCH
9#define EMPBASE 0x20000
10
11#include "cjkcodecs.h"
12#include "mappings_jp.h"
13#include "mappings_jisx0213_pair.h"
14#include "alg_jisx0201.h"
15#include "emu_jisx0213_2000.h"
16
/*
 * CP932 codec
 */
20
21ENCODER(cp932)
22{
23 while (inleft > 0) {
24 Py_UNICODE c = IN1;
25 DBCHAR code;
26 unsigned char c1, c2;
27
28 if (c <= 0x80) {
29 WRITE1((unsigned char)c)
30 NEXT(1, 1)
31 continue;
32 }
33 else if (c >= 0xff61 && c <= 0xff9f) {
34 WRITE1(c - 0xfec0)
35 NEXT(1, 1)
36 continue;
37 }
38 else if (c >= 0xf8f0 && c <= 0xf8f3) {
39 /* Windows compatability */
40 REQUIRE_OUTBUF(1)
41 if (c == 0xf8f0)
42 OUT1(0xa0)
43 else
44 OUT1(c - 0xfef1 + 0xfd)
45 NEXT(1, 1)
46 continue;
47 }
48
49 UCS4INVALID(c)
50 REQUIRE_OUTBUF(2)
51
52 TRYMAP_ENC(cp932ext, code, c) {
53 OUT1(code >> 8)
54 OUT2(code & 0xff)
55 }
56 else TRYMAP_ENC(jisxcommon, code, c) {
57 if (code & 0x8000) /* MSB set: JIS X 0212 */
58 return 1;
59
60 /* JIS X 0208 */
61 c1 = code >> 8;
62 c2 = code & 0xff;
63 c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
64 c1 = (c1 - 0x21) >> 1;
65 OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
66 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
67 }
68 else if (c >= 0xe000 && c < 0xe758) {
69 /* User-defined area */
70 c1 = (Py_UNICODE)(c - 0xe000) / 188;
71 c2 = (Py_UNICODE)(c - 0xe000) % 188;
72 OUT1(c1 + 0xf0)
73 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
74 }
75 else
76 return 1;
77
78 NEXT(1, 2)
79 }
80
81 return 0;
82}
83
84DECODER(cp932)
85{
86 while (inleft > 0) {
87 unsigned char c = IN1, c2;
88
89 REQUIRE_OUTBUF(1)
90 if (c <= 0x80) {
91 OUT1(c)
92 NEXT(1, 1)
93 continue;
94 }
95 else if (c >= 0xa0 && c <= 0xdf) {
96 if (c == 0xa0)
97 OUT1(0xf8f0) /* half-width katakana */
98 else
99 OUT1(0xfec0 + c)
100 NEXT(1, 1)
101 continue;
102 }
103 else if (c >= 0xfd/* && c <= 0xff*/) {
104 /* Windows compatibility */
105 OUT1(0xf8f1 - 0xfd + c)
106 NEXT(1, 1)
107 continue;
108 }
109
110 REQUIRE_INBUF(2)
111 c2 = IN2;
112
113 TRYMAP_DEC(cp932ext, **outbuf, c, c2);
114 else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
115 if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
116 return 2;
117
118 c = (c < 0xe0 ? c - 0x81 : c - 0xc1);
119 c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
120 c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21);
121 c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
122
123 TRYMAP_DEC(jisx0208, **outbuf, c, c2);
124 else return 2;
125 }
126 else if (c >= 0xf0 && c <= 0xf9) {
127 if ((c2 >= 0x40 && c2 <= 0x7e) ||
128 (c2 >= 0x80 && c2 <= 0xfc))
129 OUT1(0xe000 + 188 * (c - 0xf0) +
130 (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41))
131 else
132 return 2;
133 }
134 else
135 return 2;
136
137 NEXT(2, 1)
138 }
139
140 return 0;
141}
142
143
/*
 * EUC-JIS-2004 codec
 */
147
148ENCODER(euc_jis_2004)
149{
150 while (inleft > 0) {
151 ucs4_t c = IN1;
152 DBCHAR code;
153 int insize;
154
155 if (c < 0x80) {
156 WRITE1(c)
157 NEXT(1, 1)
158 continue;
159 }
160
161 DECODE_SURROGATE(c)
162 insize = GET_INSIZE(c);
163
164 if (c <= 0xFFFF) {
165 EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
166 else TRYMAP_ENC(jisx0213_bmp, code, c) {
167 if (code == MULTIC) {
168 if (inleft < 2) {
169 if (flags & MBENC_FLUSH) {
170 code = find_pairencmap(
171 (ucs2_t)c, 0,
172 jisx0213_pair_encmap,
173 JISX0213_ENCPAIRS);
174 if (code == DBCINV)
175 return 1;
176 }
177 else
178 return MBERR_TOOFEW;
179 }
180 else {
181 code = find_pairencmap(
182 (ucs2_t)c, (*inbuf)[1],
183 jisx0213_pair_encmap,
184 JISX0213_ENCPAIRS);
185 if (code == DBCINV) {
186 code = find_pairencmap(
187 (ucs2_t)c, 0,
188 jisx0213_pair_encmap,
189 JISX0213_ENCPAIRS);
190 if (code == DBCINV)
191 return 1;
192 } else
193 insize = 2;
194 }
195 }
196 }
197 else TRYMAP_ENC(jisxcommon, code, c);
198 else if (c >= 0xff61 && c <= 0xff9f) {
199 /* JIS X 0201 half-width katakana */
200 WRITE2(0x8e, c - 0xfec0)
201 NEXT(1, 2)
202 continue;
203 }
204 else if (c == 0xff3c)
205 /* F/W REVERSE SOLIDUS (see NOTES) */
206 code = 0x2140;
207 else if (c == 0xff5e)
208 /* F/W TILDE (see NOTES) */
209 code = 0x2232;
210 else
211 return 1;
212 }
213 else if (c >> 16 == EMPBASE >> 16) {
214 EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
215 else TRYMAP_ENC(jisx0213_emp, code, c & 0xffff);
216 else return insize;
217 }
218 else
219 return insize;
220
221 if (code & 0x8000) {
222 /* Codeset 2 */
223 WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
224 NEXT(insize, 3)
225 } else {
226 /* Codeset 1 */
227 WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
228 NEXT(insize, 2)
229 }
230 }
231
232 return 0;
233}
234
235DECODER(euc_jis_2004)
236{
237 while (inleft > 0) {
238 unsigned char c = IN1;
239 ucs4_t code;
240
241 REQUIRE_OUTBUF(1)
242
243 if (c < 0x80) {
244 OUT1(c)
245 NEXT(1, 1)
246 continue;
247 }
248
249 if (c == 0x8e) {
250 /* JIS X 0201 half-width katakana */
251 unsigned char c2;
252
253 REQUIRE_INBUF(2)
254 c2 = IN2;
255 if (c2 >= 0xa1 && c2 <= 0xdf) {
256 OUT1(0xfec0 + c2)
257 NEXT(2, 1)
258 }
259 else
260 return 2;
261 }
262 else if (c == 0x8f) {
263 unsigned char c2, c3;
264
265 REQUIRE_INBUF(3)
266 c2 = IN2 ^ 0x80;
267 c3 = IN3 ^ 0x80;
268
269 /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
270 EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3)
271 else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ;
272 else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) {
273 WRITEUCS4(EMPBASE | code)
274 NEXT_IN(3)
275 continue;
276 }
277 else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ;
278 else return 3;
279 NEXT(3, 1)
280 }
281 else {
282 unsigned char c2;
283
284 REQUIRE_INBUF(2)
285 c ^= 0x80;
286 c2 = IN2 ^ 0x80;
287
288 /* JIS X 0213 Plane 1 */
289 EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2)
290 else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c;
291 else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e;
292 else TRYMAP_DEC(jisx0208, **outbuf, c, c2);
293 else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2);
294 else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) {
295 WRITEUCS4(EMPBASE | code)
296 NEXT_IN(2)
297 continue;
298 }
299 else TRYMAP_DEC(jisx0213_pair, code, c, c2) {
300 WRITE2(code >> 16, code & 0xffff)
301 NEXT(2, 2)
302 continue;
303 }
304 else return 2;
305 NEXT(2, 1)
306 }
307 }
308
309 return 0;
310}
311
312
/*
 * EUC-JP codec
 */
316
317ENCODER(euc_jp)
318{
319 while (inleft > 0) {
320 Py_UNICODE c = IN1;
321 DBCHAR code;
322
323 if (c < 0x80) {
324 WRITE1((unsigned char)c)
325 NEXT(1, 1)
326 continue;
327 }
328
329 UCS4INVALID(c)
330
331 TRYMAP_ENC(jisxcommon, code, c);
332 else if (c >= 0xff61 && c <= 0xff9f) {
333 /* JIS X 0201 half-width katakana */
334 WRITE2(0x8e, c - 0xfec0)
335 NEXT(1, 2)
336 continue;
337 }
338#ifndef STRICT_BUILD
339 else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
340 code = 0x2140;
341 else if (c == 0xa5) { /* YEN SIGN */
342 WRITE1(0x5c);
343 NEXT(1, 1)
344 continue;
345 } else if (c == 0x203e) { /* OVERLINE */
346 WRITE1(0x7e);
347 NEXT(1, 1)
348 continue;
349 }
350#endif
351 else
352 return 1;
353
354 if (code & 0x8000) {
355 /* JIS X 0212 */
356 WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
357 NEXT(1, 3)
358 } else {
359 /* JIS X 0208 */
360 WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
361 NEXT(1, 2)
362 }
363 }
364
365 return 0;
366}
367
368DECODER(euc_jp)
369{
370 while (inleft > 0) {
371 unsigned char c = IN1;
372
373 REQUIRE_OUTBUF(1)
374
375 if (c < 0x80) {
376 OUT1(c)
377 NEXT(1, 1)
378 continue;
379 }
380
381 if (c == 0x8e) {
382 /* JIS X 0201 half-width katakana */
383 unsigned char c2;
384
385 REQUIRE_INBUF(2)
386 c2 = IN2;
387 if (c2 >= 0xa1 && c2 <= 0xdf) {
388 OUT1(0xfec0 + c2)
389 NEXT(2, 1)
390 }
391 else
392 return 2;
393 }
394 else if (c == 0x8f) {
395 unsigned char c2, c3;
396
397 REQUIRE_INBUF(3)
398 c2 = IN2;
399 c3 = IN3;
400 /* JIS X 0212 */
401 TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) {
402 NEXT(3, 1)
403 }
404 else
405 return 3;
406 }
407 else {
408 unsigned char c2;
409
410 REQUIRE_INBUF(2)
411 c2 = IN2;
412 /* JIS X 0208 */
413#ifndef STRICT_BUILD
414 if (c == 0xa1 && c2 == 0xc0)
415 /* FULL-WIDTH REVERSE SOLIDUS */
416 **outbuf = 0xff3c;
417 else
418#endif
419 TRYMAP_DEC(jisx0208, **outbuf,
420 c ^ 0x80, c2 ^ 0x80) ;
421 else return 2;
422 NEXT(2, 1)
423 }
424 }
425
426 return 0;
427}
428
429
/*
 * SHIFT_JIS codec
 */
433
434ENCODER(shift_jis)
435{
436 while (inleft > 0) {
437 Py_UNICODE c = IN1;
438 DBCHAR code;
439 unsigned char c1, c2;
440
441#ifdef STRICT_BUILD
442 JISX0201_R_ENCODE(c, code)
443#else
444 if (c < 0x80) code = c;
445 else if (c == 0x00a5) code = 0x5c; /* YEN SIGN */
446 else if (c == 0x203e) code = 0x7e; /* OVERLINE */
447#endif
448 else JISX0201_K_ENCODE(c, code)
449 else UCS4INVALID(c)
450 else code = NOCHAR;
451
452 if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
453 REQUIRE_OUTBUF(1)
454
455 OUT1((unsigned char)code)
456 NEXT(1, 1)
457 continue;
458 }
459
460 REQUIRE_OUTBUF(2)
461
462 if (code == NOCHAR) {
463 TRYMAP_ENC(jisxcommon, code, c);
464#ifndef STRICT_BUILD
465 else if (c == 0xff3c)
466 code = 0x2140; /* FULL-WIDTH REVERSE SOLIDUS */
467#endif
468 else
469 return 1;
470
471 if (code & 0x8000) /* MSB set: JIS X 0212 */
472 return 1;
473 }
474
475 c1 = code >> 8;
476 c2 = code & 0xff;
477 c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
478 c1 = (c1 - 0x21) >> 1;
479 OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
480 OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
481 NEXT(1, 2)
482 }
483
484 return 0;
485}
486
487DECODER(shift_jis)
488{
489 while (inleft > 0) {
490 unsigned char c = IN1;
491
492 REQUIRE_OUTBUF(1)
493
494#ifdef STRICT_BUILD
495 JISX0201_R_DECODE(c, **outbuf)
496#else
497 if (c < 0x80) **outbuf = c;
498#endif
499 else JISX0201_K_DECODE(c, **outbuf)
500 else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
501 unsigned char c1, c2;
502
503 REQUIRE_INBUF(2)
504 c2 = IN2;
505 if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
506 return 2;
507
508 c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
509 c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
510 c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1) + 0x21);
511 c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
512
513#ifndef STRICT_BUILD
514 if (c1 == 0x21 && c2 == 0x40) {
515 /* FULL-WIDTH REVERSE SOLIDUS */
516 OUT1(0xff3c)
517 NEXT(2, 1)
518 continue;
519 }
520#endif
521 TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
522 NEXT(2, 1)
523 continue;
524 }
525 else
526 return 2;
527 }
528 else
529 return 2;
530
531 NEXT(1, 1) /* JIS X 0201 */
532 }
533
534 return 0;
535}
536
537
/*
 * SHIFT_JIS-2004 codec
 */
541
542ENCODER(shift_jis_2004)
543{
544 while (inleft > 0) {
545 ucs4_t c = IN1;
546 DBCHAR code = NOCHAR;
547 int c1, c2;
548 size_t insize;
549
550 JISX0201_ENCODE(c, code)
551 else DECODE_SURROGATE(c)
552
553 if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
554 WRITE1((unsigned char)code)
555 NEXT(1, 1)
556 continue;
557 }
558
559 REQUIRE_OUTBUF(2)
560 insize = GET_INSIZE(c);
561
562 if (code == NOCHAR) {
563 if (c <= 0xffff) {
564 EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
565 else TRYMAP_ENC(jisx0213_bmp, code, c) {
566 if (code == MULTIC) {
567 if (inleft < 2) {
568 if (flags & MBENC_FLUSH) {
569 code = find_pairencmap
570 ((ucs2_t)c, 0,
571 jisx0213_pair_encmap,
572 JISX0213_ENCPAIRS);
573 if (code == DBCINV)
574 return 1;
575 }
576 else
577 return MBERR_TOOFEW;
578 }
579 else {
580 code = find_pairencmap(
581 (ucs2_t)c, IN2,
582 jisx0213_pair_encmap,
583 JISX0213_ENCPAIRS);
584 if (code == DBCINV) {
585 code = find_pairencmap(
586 (ucs2_t)c, 0,
587 jisx0213_pair_encmap,
588 JISX0213_ENCPAIRS);
589 if (code == DBCINV)
590 return 1;
591 }
592 else
593 insize = 2;
594 }
595 }
596 }
597 else TRYMAP_ENC(jisxcommon, code, c) {
598 /* abandon JIS X 0212 codes */
599 if (code & 0x8000)
600 return 1;
601 }
602 else return 1;
603 }
604 else if (c >> 16 == EMPBASE >> 16) {
605 EMULATE_JISX0213_2000_ENCODE_EMP(code, c)
606 else TRYMAP_ENC(jisx0213_emp, code, c&0xffff);
607 else return insize;
608 }
609 else
610 return insize;
611 }
612
613 c1 = code >> 8;
614 c2 = (code & 0xff) - 0x21;
615
616 if (c1 & 0x80) { /* Plane 2 */
617 if (c1 >= 0xee) c1 -= 0x87;
618 else if (c1 >= 0xac || c1 == 0xa8) c1 -= 0x49;
619 else c1 -= 0x43;
620 }
621 else /* Plane 1 */
622 c1 -= 0x21;
623
624 if (c1 & 1) c2 += 0x5e;
625 c1 >>= 1;
626 OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
627 OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
628
629 NEXT(insize, 2)
630 }
631
632 return 0;
633}
634
635DECODER(shift_jis_2004)
636{
637 while (inleft > 0) {
638 unsigned char c = IN1;
639
640 REQUIRE_OUTBUF(1)
641 JISX0201_DECODE(c, **outbuf)
642 else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
643 unsigned char c1, c2 = IN2;
644 ucs4_t code;
645
646 REQUIRE_INBUF(2)
647 if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
648 return 2;
649
650 c1 = (c < 0xe0 ? c - 0x81 : c - 0xc1);
651 c2 = (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41);
652 c1 = (2 * c1 + (c2 < 0x5e ? 0 : 1));
653 c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21;
654
655 if (c1 < 0x5e) { /* Plane 1 */
656 c1 += 0x21;
657 EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf,
658 c1, c2)
659 else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) {
660 NEXT_OUT(1)
661 }
662 else TRYMAP_DEC(jisx0213_1_bmp, **outbuf,
663 c1, c2) {
664 NEXT_OUT(1)
665 }
666 else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) {
667 WRITEUCS4(EMPBASE | code)
668 }
669 else TRYMAP_DEC(jisx0213_pair, code, c1, c2) {
670 WRITE2(code >> 16, code & 0xffff)
671 NEXT_OUT(2)
672 }
673 else
674 return 2;
675 NEXT_IN(2)
676 }
677 else { /* Plane 2 */
678 if (c1 >= 0x67) c1 += 0x07;
679 else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37;
680 else c1 -= 0x3d;
681
682 EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf,
683 c1, c2)
684 else TRYMAP_DEC(jisx0213_2_bmp, **outbuf,
685 c1, c2) ;
686 else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) {
687 WRITEUCS4(EMPBASE | code)
688 NEXT_IN(2)
689 continue;
690 }
691 else
692 return 2;
693 NEXT(2, 1)
694 }
695 continue;
696 }
697 else
698 return 2;
699
700 NEXT(1, 1) /* JIS X 0201 */
701 }
702
703 return 0;
704}
705
706
707BEGIN_MAPPINGS_LIST
708 MAPPING_DECONLY(jisx0208)
709 MAPPING_DECONLY(jisx0212)
710 MAPPING_ENCONLY(jisxcommon)
711 MAPPING_DECONLY(jisx0213_1_bmp)
712 MAPPING_DECONLY(jisx0213_2_bmp)
713 MAPPING_ENCONLY(jisx0213_bmp)
714 MAPPING_DECONLY(jisx0213_1_emp)
715 MAPPING_DECONLY(jisx0213_2_emp)
716 MAPPING_ENCONLY(jisx0213_emp)
717 MAPPING_ENCDEC(jisx0213_pair)
718 MAPPING_ENCDEC(cp932ext)
719END_MAPPINGS_LIST
720
721BEGIN_CODECS_LIST
722 CODEC_STATELESS(shift_jis)
723 CODEC_STATELESS(cp932)
724 CODEC_STATELESS(euc_jp)
725 CODEC_STATELESS(shift_jis_2004)
726 CODEC_STATELESS(euc_jis_2004)
727 { "euc_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(euc_jis_2004) },
728 { "shift_jisx0213", (void *)2000, NULL, _STATELESS_METHODS(shift_jis_2004) },
729END_CODECS_LIST
730
731I_AM_A_MODULE_FOR(jp)