blob: 4a5f0c123c44a6f65cc12dd065710713c76b5bc1 [file] [log] [blame]
/*
 * cjkcodecs.h: common header for cjkcodecs
 *
 * Written by Hye-Shik Chang <perky@FreeBSD.org>
 */
6
7#ifndef _CJKCODECS_H_
8#define _CJKCODECS_H_
9
Hye-Shik Chang4b96c132006-03-04 16:08:19 +000010#define PY_SSIZE_T_CLEAN
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000011#include "Python.h"
12#include "multibytecodec.h"
13
14
/* Unicode value used to mark an "undefined" code point. */
#define UNIINV  0xFFFE

/* DBCS values reserved for internal use; no charset assigns them. */
#define NOCHAR  0xFFFF
#define MULTIC  0xFFFE
#define DBCINV  0xFFFD

/* One-letter aliases that keep the mapping-table sources small. */
#define U       UNIINV
#define N       NOCHAR
#define M       MULTIC
#define D       DBCINV
28
29struct dbcs_index {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000030 const ucs2_t *map;
31 unsigned char bottom, top;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000032};
33typedef struct dbcs_index decode_map;
34
35struct widedbcs_index {
Victor Stinnera0dd0212013-04-11 22:09:04 +020036 const Py_UCS4 *map;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 unsigned char bottom, top;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000038};
39typedef struct widedbcs_index widedecode_map;
40
41struct unim_index {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000042 const DBCHAR *map;
43 unsigned char bottom, top;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000044};
45typedef struct unim_index encode_map;
46
/* Index row whose table entries are single bytes. */
struct unim_index_bytebased {
    const unsigned char *map;
    unsigned char bottom;
    unsigned char top;
};
51
/*
 * A named charset together with its encode and decode tables.  Either
 * table pointer may be NULL (see MAPPING_ENCONLY / MAPPING_DECONLY).
 */
struct dbcs_map {
    const char *charset;                /* charset name */
    const struct unim_index *encmap;    /* encode table */
    const struct dbcs_index *decmap;    /* decode table */
};
57
58struct pair_encodemap {
Victor Stinnera0dd0212013-04-11 22:09:04 +020059 Py_UCS4 uniseq;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000060 DBCHAR code;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000061};
62
Hye-Shik Chang64a9e382004-07-18 15:02:45 +000063static const MultibyteCodec *codec_list;
64static const struct dbcs_map *mapping_list;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000065
/*
 * Signature helpers.  Each macro expands to the header (no body, no
 * semicolon) of one static entry point of the codec named `encoding`.
 * The parameter names introduced here (state, config, kind, data, inpos,
 * inlen, outbuf, outleft, flags, inbuf, inleft, writer) are the ones the
 * buffer macros further down refer to.
 */
#define CODEC_INIT(encoding)                                            \
    static int encoding##_codec_init(const void *config)

#define ENCODER_INIT(encoding)                                          \
    static int encoding##_encode_init(                                  \
        MultibyteCodec_State *state, const void *config)

#define ENCODER(encoding)                                               \
    static Py_ssize_t encoding##_encode(                                \
        MultibyteCodec_State *state, const void *config,                \
        int kind, void *data,                                           \
        Py_ssize_t *inpos, Py_ssize_t inlen,                            \
        unsigned char **outbuf, Py_ssize_t outleft, int flags)

#define ENCODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_encode_reset(                          \
        MultibyteCodec_State *state, const void *config,                \
        unsigned char **outbuf, Py_ssize_t outleft)

#define DECODER_INIT(encoding)                                          \
    static int encoding##_decode_init(                                  \
        MultibyteCodec_State *state, const void *config)

#define DECODER(encoding)                                               \
    static Py_ssize_t encoding##_decode(                                \
        MultibyteCodec_State *state, const void *config,                \
        const unsigned char **inbuf, Py_ssize_t inleft,                 \
        _PyUnicodeWriter *writer)

#define DECODER_RESET(encoding)                                         \
    static Py_ssize_t encoding##_decode_reset(                          \
        MultibyteCodec_State *state, const void *config)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +000094
/*
 * Cursor helpers.  They operate on the locals named by the ENCODER /
 * DECODER signatures: inbuf/inleft (decoder input), inpos (encoder
 * input index) and outbuf/outleft (encoder output).
 */

/* Consume `i` input bytes. */
#define NEXT_IN(i)                      \
    do {                                \
        (*inbuf) += (i);                \
        (inleft) -= (i);                \
    } while (0)

/* Advance the input character index by `i`. */
#define NEXT_INCHAR(i)                  \
    do {                                \
        (*inpos) += (i);                \
    } while (0)

/* Account for `o` bytes written to the output buffer. */
#define NEXT_OUT(o)                     \
    do {                                \
        (*outbuf) += (o);               \
        (outleft) -= (o);               \
    } while (0)

/* Advance the input index by `i` and the output buffer by `o`. */
#define NEXT(i, o)                      \
    do {                                \
        NEXT_INCHAR(i);                 \
        NEXT_OUT(o);                    \
    } while (0)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000114
/*
 * Bail out of the enclosing codec function when fewer than `n` bytes are
 * available.  These expand to a complete `if` statement (including the
 * trailing semicolon) because the WRITEBYTE* macros invoke them without
 * one.
 */
#define REQUIRE_INBUF(n)                \
    if (inleft < (n))                   \
        return MBERR_TOOFEW;

#define REQUIRE_OUTBUF(n)               \
    if (outleft < (n))                  \
        return MBERR_TOOSMALL;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000121
/* Peek at the next input bytes of a decoder without consuming them. */
#define INBYTE1     ((*inbuf)[0])
#define INBYTE2     ((*inbuf)[1])
#define INBYTE3     ((*inbuf)[2])
#define INBYTE4     ((*inbuf)[3])

/* Peek at the current / following input character of an encoder. */
#define INCHAR1     PyUnicode_READ(kind, data, *inpos)
#define INCHAR2     PyUnicode_READ(kind, data, *inpos + 1)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000129
/*
 * Append one character to `writer`; on failure return MBERR_EXCEPTION
 * from the enclosing function.
 */
#define OUTCHAR(c)                                                      \
    do {                                                                \
        if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0)                \
            return MBERR_EXCEPTION;                                     \
    } while (0)
135
/*
 * Append two characters to `writer`; on failure return MBERR_EXCEPTION
 * from the enclosing function.
 *
 * Both arguments are captured in locals first so that each is evaluated
 * exactly once.  The maximum passed to _PyUnicodeWriter_Prepare() must
 * therefore be computed from the captured copies (_c1, _c2), never from
 * the raw macro arguments.
 */
#define OUTCHAR2(c1, c2)                                                \
    do {                                                                \
        Py_UCS4 _c1 = (c1);                                             \
        Py_UCS4 _c2 = (c2);                                             \
        if (_PyUnicodeWriter_Prepare(writer, 2, Py_MAX(_c1, _c2)) < 0)  \
            return MBERR_EXCEPTION;                                     \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1);  \
        PyUnicode_WRITE(writer->kind, writer->data, writer->pos + 1, _c2); \
        writer->pos += 2;                                               \
    } while (0)
146
/*
 * Store one byte at a fixed offset from the output cursor, without any
 * space check.  Each expansion already ends with a semicolon.
 */
#define OUTBYTE1(c)     ((*outbuf)[0]) = (c);
#define OUTBYTE2(c)     ((*outbuf)[1]) = (c);
#define OUTBYTE3(c)     ((*outbuf)[2]) = (c);
#define OUTBYTE4(c)     ((*outbuf)[3]) = (c);
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000151
/*
 * Checked stores: first require room for N output bytes (returning
 * MBERR_TOOSMALL from the enclosing function otherwise), then write them
 * at the output cursor.  The cursor itself is not advanced.
 */
#define WRITEBYTE1(c1)                  \
    REQUIRE_OUTBUF(1)                   \
    (*outbuf)[0] = (c1);

#define WRITEBYTE2(c1, c2)              \
    REQUIRE_OUTBUF(2)                   \
    (*outbuf)[0] = (c1);                \
    (*outbuf)[1] = (c2);

#define WRITEBYTE3(c1, c2, c3)          \
    REQUIRE_OUTBUF(3)                   \
    (*outbuf)[0] = (c1);                \
    (*outbuf)[1] = (c2);                \
    (*outbuf)[2] = (c3);

#define WRITEBYTE4(c1, c2, c3, c4)      \
    REQUIRE_OUTBUF(4)                   \
    (*outbuf)[0] = (c1);                \
    (*outbuf)[1] = (c2);                \
    (*outbuf)[2] = (c3);                \
    (*outbuf)[3] = (c4);
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000170
/*
 * Encode lookup.  _TRYMAP_ENC is true when row `m` has a table, `val`
 * lies in bottom..top, and the entry is not NOCHAR; as a side effect the
 * entry is stored in `assi` whenever the range checks pass.
 * TRYMAP_ENC picks the row of <charset>_encmap from the high bits of
 * `uni` and looks up its low byte.
 */
#define _TRYMAP_ENC(m, assi, val)                                       \
    ((m)->map != NULL &&                                                \
     (val) >= (m)->bottom &&                                            \
     (val) <= (m)->top &&                                               \
     ((assi) = (m)->map[(val) - (m)->bottom]) != NOCHAR)

#define TRYMAP_ENC(charset, assi, uni)                                  \
    _TRYMAP_ENC(&charset##_encmap[(uni) >> 8], assi, (uni) & 0xff)
Thomas Wouters89f507f2006-12-13 04:49:30 +0000177
/*
 * Decode lookup.  _TRYMAP_DEC is true when row `m` has a table, `val`
 * lies in bottom..top, and the entry is not UNIINV; as a side effect the
 * entry is stored in `assi` whenever the range checks pass.
 * TRYMAP_DEC picks row `c1` of <charset>_decmap and looks up `c2` in it.
 */
#define _TRYMAP_DEC(m, assi, val)                                       \
    ((m)->map != NULL && (val) >= (m)->bottom && (val) <= (m)->top &&   \
     ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV)

#define TRYMAP_DEC(charset, assi, c1, c2)                               \
    _TRYMAP_DEC(&charset##_decmap[c1], assi, c2)
Victor Stinnera0dd0212013-04-11 22:09:04 +0200185
/*
 * Builders for the module's charset table.  Usage:
 *     BEGIN_MAPPINGS_LIST  MAPPING_*(name) ...  END_MAPPINGS_LIST
 * The table ends with an entry whose charset is the empty string, which
 * is the terminator register_maps() looks for.
 */
#define BEGIN_MAPPINGS_LIST                                             \
    static const struct dbcs_map _mapping_list[] = {

#define MAPPING_ENCONLY(enc)    {#enc, (void*)enc##_encmap, NULL},
#define MAPPING_DECONLY(enc)    {#enc, NULL, (void*)enc##_decmap},
#define MAPPING_ENCDEC(enc)     {#enc, (void*)enc##_encmap, (void*)enc##_decmap},

#define END_MAPPINGS_LIST                                               \
        {"", NULL, NULL}                                                \
    };                                                                  \
    static const struct dbcs_map *mapping_list =                        \
        (const struct dbcs_map *)_mapping_list;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000194
/*
 * Builders for the module's codec table.  Usage:
 *     BEGIN_CODECS_LIST  CODEC_*(name) ...  END_CODECS_LIST
 * Every entry starts with the stringified codec name.  The table ends
 * with an entry whose name is the empty string, which is the terminator
 * getcodec() looks for.
 */
#define BEGIN_CODECS_LIST                                               \
    static const MultibyteCodec _codec_list[] = {

/* All six encode/decode entry points are provided by the codec. */
#define _STATEFUL_METHODS(enc)                                          \
    enc##_encode,                                                       \
    enc##_encode_init,                                                  \
    enc##_encode_reset,                                                 \
    enc##_decode,                                                       \
    enc##_decode_init,                                                  \
    enc##_decode_reset,

/* Only <enc>_encode and <enc>_decode exist; the init/reset slots are NULL. */
#define _STATELESS_METHODS(enc)                                         \
    enc##_encode, NULL, NULL,                                           \
    enc##_decode, NULL, NULL,

#define CODEC_STATEFUL(enc) {                                           \
    #enc, NULL, NULL,                                                   \
    _STATEFUL_METHODS(enc)                                              \
},

#define CODEC_STATELESS(enc) {                                          \
    #enc, NULL, NULL,                                                   \
    _STATELESS_METHODS(enc)                                             \
},

/* Stateless codec that additionally supplies <enc>_codec_init. */
#define CODEC_STATELESS_WINIT(enc) {                                    \
    #enc, NULL,                                                         \
    enc##_codec_init,                                                   \
    _STATELESS_METHODS(enc)                                             \
},

#define END_CODECS_LIST                                                 \
        {"", NULL,}                                                     \
    };                                                                  \
    static const MultibyteCodec *codec_list =                           \
        (const MultibyteCodec *)_codec_list;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000223
Benjamin Petersonb173f782009-05-05 22:31:58 +0000224
225
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000226static PyObject *
227getmultibytecodec(void)
228{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 static PyObject *cofunc = NULL;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000230
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 if (cofunc == NULL) {
232 PyObject *mod = PyImport_ImportModuleNoBlock("_multibytecodec");
233 if (mod == NULL)
234 return NULL;
235 cofunc = PyObject_GetAttrString(mod, "__create_codec");
236 Py_DECREF(mod);
237 }
238 return cofunc;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000239}
240
241static PyObject *
242getcodec(PyObject *self, PyObject *encoding)
243{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000244 PyObject *codecobj, *r, *cofunc;
245 const MultibyteCodec *codec;
246 const char *enc;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000247
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 if (!PyUnicode_Check(encoding)) {
249 PyErr_SetString(PyExc_TypeError,
250 "encoding name must be a string.");
251 return NULL;
252 }
253 enc = _PyUnicode_AsString(encoding);
254 if (enc == NULL)
255 return NULL;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000256
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 cofunc = getmultibytecodec();
258 if (cofunc == NULL)
259 return NULL;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000260
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 for (codec = codec_list; codec->encoding[0]; codec++)
262 if (strcmp(codec->encoding, enc) == 0)
263 break;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000264
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 if (codec->encoding[0] == '\0') {
266 PyErr_SetString(PyExc_LookupError,
267 "no such codec is supported.");
268 return NULL;
269 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000270
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000271 codecobj = PyCapsule_New((void *)codec, PyMultibyteCodec_CAPSULE_NAME, NULL);
272 if (codecobj == NULL)
273 return NULL;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000274
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000275 r = PyObject_CallFunctionObjArgs(cofunc, codecobj, NULL);
276 Py_DECREF(codecobj);
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000277
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 return r;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000279}
280
281static struct PyMethodDef __methods[] = {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 {"getcodec", (PyCFunction)getcodec, METH_O, ""},
283 {NULL, NULL},
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000284};
285
286static int
287register_maps(PyObject *module)
288{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000289 const struct dbcs_map *h;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000290
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000291 for (h = mapping_list; h->charset[0] != '\0'; h++) {
292 char mhname[256] = "__map_";
293 int r;
294 strcpy(mhname + sizeof("__map_") - 1, h->charset);
295 r = PyModule_AddObject(module, mhname,
296 PyCapsule_New((void *)h, PyMultibyteCodec_CAPSULE_NAME, NULL));
297 if (r == -1)
298 return -1;
299 }
300 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000301}
302
303#ifdef USING_BINARY_PAIR_SEARCH
304static DBCHAR
305find_pairencmap(ucs2_t body, ucs2_t modifier,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000306 const struct pair_encodemap *haystack, int haystacksize)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000307{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 int pos, min, max;
Victor Stinnera0dd0212013-04-11 22:09:04 +0200309 Py_UCS4 value = body << 16 | modifier;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000310
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000311 min = 0;
312 max = haystacksize;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000313
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000314 for (pos = haystacksize >> 1; min != max; pos = (min + max) >> 1)
315 if (value < haystack[pos].uniseq) {
316 if (max == pos) break;
317 else max = pos;
318 }
319 else if (value > haystack[pos].uniseq) {
320 if (min == pos) break;
321 else min = pos;
322 }
323 else
324 break;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000325
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 if (value == haystack[pos].uniseq)
327 return haystack[pos].code;
328 else
329 return DBCINV;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000330}
331#endif
332
333#ifdef USING_IMPORTED_MAPS
/*
 * Fetch the tables of `charset` from the module "_codecs_<locale>" via
 * importmap().  `encmap` / `decmap` are the addresses to fill in; they
 * are passed through as `const void **`.
 */
#define IMPORT_MAP(locale, charset, encmap, decmap)                     \
    importmap("_codecs_" #locale, "__map_" #charset,                    \
              (const void**)encmap, (const void**)decmap)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000337
338static int
339importmap(const char *modname, const char *symbol,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 const void **encmap, const void **decmap)
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000341{
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000342 PyObject *o, *mod;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000343
Serhiy Storchakac6792272013-10-19 21:03:34 +0300344 mod = PyImport_ImportModule(modname);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000345 if (mod == NULL)
346 return -1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000347
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 o = PyObject_GetAttrString(mod, (char*)symbol);
349 if (o == NULL)
350 goto errorexit;
351 else if (!PyCapsule_IsValid(o, PyMultibyteCodec_CAPSULE_NAME)) {
352 PyErr_SetString(PyExc_ValueError,
353 "map data must be a Capsule.");
354 goto errorexit;
355 }
356 else {
357 struct dbcs_map *map;
358 map = PyCapsule_GetPointer(o, PyMultibyteCodec_CAPSULE_NAME);
359 if (encmap != NULL)
360 *encmap = map->encmap;
361 if (decmap != NULL)
362 *decmap = map->decmap;
363 Py_DECREF(o);
364 }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 Py_DECREF(mod);
367 return 0;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000368
369errorexit:
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000370 Py_DECREF(mod);
371 return -1;
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000372}
373#endif
374
/*
 * Define the module object and the PyInit__codecs_<loc>() entry point
 * for the extension module "_codecs_<loc>".
 *
 * The init function creates the module and registers the charset maps on
 * it.  If registration fails the half-built module is released and NULL
 * is returned, so the pending exception is reported to the importer
 * instead of a module being returned with an error set.
 */
#define I_AM_A_MODULE_FOR(loc)                                          \
    static struct PyModuleDef __module = {                              \
        PyModuleDef_HEAD_INIT,                                          \
        "_codecs_"#loc,                                                 \
        NULL,                                                           \
        0,                                                              \
        __methods,                                                      \
        NULL,                                                           \
        NULL,                                                           \
        NULL,                                                           \
        NULL                                                            \
    };                                                                  \
    PyObject*                                                           \
    PyInit__codecs_##loc(void)                                          \
    {                                                                   \
        PyObject *m = PyModule_Create(&__module);                       \
        if (m != NULL && register_maps(m) < 0) {                        \
            Py_DECREF(m);                                               \
            m = NULL;                                                   \
        }                                                               \
        return m;                                                       \
    }
Hye-Shik Chang2bb146f2004-07-18 03:06:29 +0000395
396#endif