blob: 9e9e96c4d1c16345108f01e709208c112a929850 [file] [log] [blame]
/*
 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */
6
7#include "cjkcodecs.h"
8#include "mappings_cn.h"
9
/**
 * hz is predefined as 100 on AIX, so we undefine it to avoid a
 * conflict with the identifiers of the hz codec.
 */
14#ifdef _AIX
15#undef hz
16#endif
17
/* GBK and GB2312 map a few code points differently; they are listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
25
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte sequence (dc1, dc2) into
 * `assi` using GBK rules.
 *
 * The three byte pairs that GBK maps differently from GB2312 are handled
 * first.  Anything else is looked up in the gb2312 table with the high bit
 * of both bytes flipped, and then in the gbkext table with the raw bytes.
 *
 * The expansion is an open if / else-if chain whose last arm is an empty
 * statement; the callers in this file continue it with a further `else`
 * that handles the "no mapping found" case.
 *
 * Every use of dc1 and dc2 is parenthesized so that an expression argument
 * (for example `a | b`) keeps its meaning next to the `^` operator.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
Thomas Wouters89f507f2006-12-13 04:49:30 +000032
/*
 * GBK_ENCODE(code, assi): map the Unicode value `code` to its GBK double-byte
 * code and store it in `assi`.
 *
 * U+2014, U+2015 and U+00B7 get the GBK-specific assignments; U+30FB is
 * deliberately excluded from the table lookup; every other value is looked
 * up in the gbcommon encode map.
 *
 * The expansion is an open if / else-if chain ending in an empty statement,
 * so callers append their own `else` arm for the "not encodable" case.
 */
#define GBK_ENCODE(code, assi)                                      \
    if ((code) == 0x2014)                                           \
        (assi) = 0xA1AA;                                            \
    else if ((code) == 0x2015)                                      \
        (assi) = 0xA844;                                            \
    else if ((code) == 0x00B7)                                      \
        (assi) = 0xA1A4;                                            \
    else if ((code) != 0x30FB &&                                    \
             TRYMAP_ENC_COND(gbcommon, assi, code))                 \
        ;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000038
39/*
40 * GB2312 codec
41 */
42
43ENCODER(gb2312)
44{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000045 while (inleft > 0) {
46 Py_UNICODE c = IN1;
47 DBCHAR code;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000048
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000049 if (c < 0x80) {
50 WRITE1((unsigned char)c)
51 NEXT(1, 1)
52 continue;
53 }
54 UCS4INVALID(c)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000055
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000056 REQUIRE_OUTBUF(2)
57 TRYMAP_ENC(gbcommon, code, c);
58 else return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000059
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000060 if (code & 0x8000) /* MSB set: GBK */
61 return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000062
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 OUT1((code >> 8) | 0x80)
64 OUT2((code & 0xFF) | 0x80)
65 NEXT(1, 2)
66 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000067
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000068 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000069}
70
71DECODER(gb2312)
72{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000073 while (inleft > 0) {
74 unsigned char c = **inbuf;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000075
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000076 REQUIRE_OUTBUF(1)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000077
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000078 if (c < 0x80) {
79 OUT1(c)
80 NEXT(1, 1)
81 continue;
82 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000083
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000084 REQUIRE_INBUF(2)
85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
86 NEXT(2, 1)
87 }
Victor Stinner2cded9c2011-07-08 01:45:13 +020088 else return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000089 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000090
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000092}
93
94
95/*
96 * GBK codec
97 */
98
99ENCODER(gbk)
100{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000101 while (inleft > 0) {
102 Py_UNICODE c = IN1;
103 DBCHAR code;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000104
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000105 if (c < 0x80) {
106 WRITE1((unsigned char)c)
107 NEXT(1, 1)
108 continue;
109 }
110 UCS4INVALID(c)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000111
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000112 REQUIRE_OUTBUF(2)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000113
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000114 GBK_ENCODE(c, code)
115 else return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000116
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000117 OUT1((code >> 8) | 0x80)
118 if (code & 0x8000)
119 OUT2((code & 0xFF)) /* MSB set: GBK */
120 else
121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
122 NEXT(1, 2)
123 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000124
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000125 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000126}
127
128DECODER(gbk)
129{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000130 while (inleft > 0) {
131 unsigned char c = IN1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000132
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000133 REQUIRE_OUTBUF(1)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000134
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 if (c < 0x80) {
136 OUT1(c)
137 NEXT(1, 1)
138 continue;
139 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000140
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 REQUIRE_INBUF(2)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000142
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 GBK_DECODE(c, IN2, **outbuf)
Victor Stinner2cded9c2011-07-08 01:45:13 +0200144 else return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000145
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 NEXT(2, 1)
147 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000148
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000149 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000150}
151
152
153/*
154 * GB18030 codec
155 */
156
157ENCODER(gb18030)
158{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000159 while (inleft > 0) {
160 ucs4_t c = IN1;
161 DBCHAR code;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000162
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 if (c < 0x80) {
164 WRITE1(c)
165 NEXT(1, 1)
166 continue;
167 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000168
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 DECODE_SURROGATE(c)
170 if (c > 0x10FFFF)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000171#if Py_UNICODE_SIZE == 2
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 return 2; /* surrogates pair */
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000173#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000175#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 else if (c >= 0x10000) {
177 ucs4_t tc = c - 0x10000;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000178
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 REQUIRE_OUTBUF(4)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000180
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000181 OUT4((unsigned char)(tc % 10) + 0x30)
182 tc /= 10;
183 OUT3((unsigned char)(tc % 126) + 0x81)
184 tc /= 126;
185 OUT2((unsigned char)(tc % 10) + 0x30)
186 tc /= 10;
187 OUT1((unsigned char)(tc + 0x90))
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000188
189#if Py_UNICODE_SIZE == 2
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000190 NEXT(2, 4) /* surrogates pair */
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000191#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 NEXT(1, 4)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000193#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 continue;
195 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000196
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000197 REQUIRE_OUTBUF(2)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000198
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000199 GBK_ENCODE(c, code)
200 else TRYMAP_ENC(gb18030ext, code, c);
201 else {
202 const struct _gb18030_to_unibmp_ranges *utrrange;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000203
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000204 REQUIRE_OUTBUF(4)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000205
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000206 for (utrrange = gb18030_to_unibmp_ranges;
207 utrrange->first != 0;
208 utrrange++)
209 if (utrrange->first <= c &&
210 c <= utrrange->last) {
211 Py_UNICODE tc;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000212
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 tc = c - utrrange->first +
214 utrrange->base;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000215
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000216 OUT4((unsigned char)(tc % 10) + 0x30)
217 tc /= 10;
218 OUT3((unsigned char)(tc % 126) + 0x81)
219 tc /= 126;
220 OUT2((unsigned char)(tc % 10) + 0x30)
221 tc /= 10;
222 OUT1((unsigned char)tc + 0x81)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000223
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 NEXT(1, 4)
225 break;
226 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000227
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000228 if (utrrange->first == 0)
229 return 1;
230 continue;
231 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000232
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 OUT1((code >> 8) | 0x80)
234 if (code & 0x8000)
235 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
236 else
237 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000238
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 NEXT(1, 2)
240 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000241
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000243}
244
245DECODER(gb18030)
246{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 while (inleft > 0) {
248 unsigned char c = IN1, c2;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000249
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 REQUIRE_OUTBUF(1)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000251
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000252 if (c < 0x80) {
253 OUT1(c)
254 NEXT(1, 1)
255 continue;
256 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000257
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 REQUIRE_INBUF(2)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000259
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 c2 = IN2;
261 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262 const struct _gb18030_to_unibmp_ranges *utr;
263 unsigned char c3, c4;
264 ucs4_t lseq;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000265
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 REQUIRE_INBUF(4)
267 c3 = IN3;
268 c4 = IN4;
269 if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
Victor Stinner2cded9c2011-07-08 01:45:13 +0200270 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 c -= 0x81; c2 -= 0x30;
272 c3 -= 0x81; c4 -= 0x30;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000273
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 if (c < 4) { /* U+0080 - U+FFFF */
275 lseq = ((ucs4_t)c * 10 + c2) * 1260 +
276 (ucs4_t)c3 * 10 + c4;
277 if (lseq < 39420) {
278 for (utr = gb18030_to_unibmp_ranges;
279 lseq >= (utr + 1)->base;
280 utr++) ;
281 OUT1(utr->first - utr->base + lseq)
282 NEXT(4, 1)
283 continue;
284 }
285 }
286 else if (c >= 15) { /* U+10000 - U+10FFFF */
287 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
288 * 1260 + (ucs4_t)c3 * 10 + c4;
289 if (lseq <= 0x10FFFF) {
290 WRITEUCS4(lseq);
291 NEXT_IN(4)
292 continue;
293 }
294 }
Victor Stinner2cded9c2011-07-08 01:45:13 +0200295 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000296 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000297
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 GBK_DECODE(c, c2, **outbuf)
299 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
Victor Stinner2cded9c2011-07-08 01:45:13 +0200300 else return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000301
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000302 NEXT(2, 1)
303 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000304
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000305 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000306}
307
308
309/*
310 * HZ codec
311 */
312
313ENCODER_INIT(hz)
314{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 state->i = 0;
316 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000317}
318
319ENCODER_RESET(hz)
320{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000321 if (state->i != 0) {
322 WRITE2('~', '}')
323 state->i = 0;
324 NEXT_OUT(2)
325 }
326 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000327}
328
329ENCODER(hz)
330{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000331 while (inleft > 0) {
332 Py_UNICODE c = IN1;
333 DBCHAR code;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000334
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000335 if (c < 0x80) {
336 if (state->i == 0) {
337 WRITE1((unsigned char)c)
338 NEXT(1, 1)
339 }
340 else {
341 WRITE3('~', '}', (unsigned char)c)
342 NEXT(1, 3)
343 state->i = 0;
344 }
345 continue;
346 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 UCS4INVALID(c)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000349
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 TRYMAP_ENC(gbcommon, code, c);
351 else return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000352
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000353 if (code & 0x8000) /* MSB set: GBK */
354 return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000355
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000356 if (state->i == 0) {
357 WRITE4('~', '{', code >> 8, code & 0xff)
358 NEXT(1, 4)
359 state->i = 1;
360 }
361 else {
362 WRITE2(code >> 8, code & 0xff)
363 NEXT(1, 2)
364 }
365 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000366
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000367 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000368}
369
370DECODER_INIT(hz)
371{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 state->i = 0;
373 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000374}
375
376DECODER_RESET(hz)
377{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 state->i = 0;
379 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000380}
381
382DECODER(hz)
383{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000384 while (inleft > 0) {
385 unsigned char c = IN1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000386
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 if (c == '~') {
388 unsigned char c2 = IN2;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000389
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000390 REQUIRE_INBUF(2)
391 if (c2 == '~') {
392 WRITE1('~')
393 NEXT(2, 1)
394 continue;
395 }
396 else if (c2 == '{' && state->i == 0)
397 state->i = 1; /* set GB */
398 else if (c2 == '}' && state->i == 1)
399 state->i = 0; /* set ASCII */
400 else if (c2 == '\n')
401 ; /* line-continuation */
402 else
Victor Stinner2cded9c2011-07-08 01:45:13 +0200403 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 NEXT(2, 0);
405 continue;
406 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000407
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000408 if (c & 0x80)
409 return 1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000410
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000411 if (state->i == 0) { /* ASCII mode */
412 WRITE1(c)
413 NEXT(1, 1)
414 }
415 else { /* GB mode */
416 REQUIRE_INBUF(2)
417 REQUIRE_OUTBUF(1)
418 TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
419 NEXT(2, 1)
420 }
421 else
Victor Stinner2cded9c2011-07-08 01:45:13 +0200422 return 1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 }
424 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000425
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000427}
428
429
430BEGIN_MAPPINGS_LIST
431 MAPPING_DECONLY(gb2312)
432 MAPPING_DECONLY(gbkext)
433 MAPPING_ENCONLY(gbcommon)
434 MAPPING_ENCDEC(gb18030ext)
435END_MAPPINGS_LIST
436
437BEGIN_CODECS_LIST
438 CODEC_STATELESS(gb2312)
439 CODEC_STATELESS(gbk)
440 CODEC_STATELESS(gb18030)
441 CODEC_STATEFUL(hz)
442END_CODECS_LIST
443
444I_AM_A_MODULE_FOR(cn)