blob: 3df47f3f2bf9988bda060d2f650200fa3caec990 [file] [log] [blame]
Hye-Shik Chang3e2a3062004-01-17 14:29:29 +00001/*
2 * multibytecodec.c: Common Multibyte Codec Implementation
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 * $CJKCodecs: multibytecodec.c,v 1.6 2004/01/17 11:26:10 perky Exp $
6 */
7
8#include "Python.h"
9#include "multibytecodec.h"
10
11
12typedef struct {
13 const Py_UNICODE *inbuf, *inbuf_top, *inbuf_end;
14 unsigned char *outbuf, *outbuf_end;
15 PyObject *excobj, *outobj;
16} MultibyteEncodeBuffer;
17
18typedef struct {
19 const unsigned char *inbuf, *inbuf_top, *inbuf_end;
20 Py_UNICODE *outbuf, *outbuf_end;
21 PyObject *excobj, *outobj;
22} MultibyteDecodeBuffer;
23
24PyDoc_STRVAR(MultibyteCodec_Encode__doc__,
25"I.encode(unicode[, errors]) -> (string, length consumed)\n\
26\n\
27Return an encoded string version of `unicode'. errors may be given to\n\
28set a different error handling scheme. Default is 'strict' meaning that\n\
29encoding errors raise a UnicodeEncodeError. Other possible values are\n\
30'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\
31registered with codecs.register_error that can handle UnicodeEncodeErrors.");
32
33PyDoc_STRVAR(MultibyteCodec_Decode__doc__,
34"I.decode(string[, errors]) -> (unicodeobject, length consumed)\n\
35\n\
36Decodes `string' using I, an MultibyteCodec instance. errors may be given\n\
37to set a different error handling scheme. Default is 'strict' meaning\n\
38that encoding errors raise a UnicodeDecodeError. Other possible values\n\
39are 'ignore' and 'replace' as well as any other name registerd with\n\
40codecs.register_error that is able to handle UnicodeDecodeErrors.");
41
42PyDoc_STRVAR(MultibyteCodec_StreamReader__doc__,
43"I.StreamReader(stream[, errors]) -> StreamReader instance");
44
45PyDoc_STRVAR(MultibyteCodec_StreamWriter__doc__,
46"I.StreamWriter(stream[, errors]) -> StreamWriter instance");
47
48static char *codeckwarglist[] = {"input", "errors", NULL};
49static char *streamkwarglist[] = {"stream", "errors", NULL};
50
51static PyObject *multibytecodec_encode(MultibyteCodec *,
52 MultibyteCodec_State *, const Py_UNICODE **, size_t,
53 PyObject *, int);
54static PyObject *mbstreamreader_create(MultibyteCodec *,
55 PyObject *, const char *);
56static PyObject *mbstreamwriter_create(MultibyteCodec *,
57 PyObject *, const char *);
58
/* Encoder flag: reset after an encoding session.  The expansion is
 * parenthesized so the macro stays a single operand in any expression. */
#define MBENC_RESET (MBENC_MAX<<1)
60
61static PyObject *
62make_tuple(PyObject *unicode, int len)
63{
64 PyObject *v, *w;
65
66 if (unicode == NULL)
67 return NULL;
68
69 v = PyTuple_New(2);
70 if (v == NULL) {
71 Py_DECREF(unicode);
72 return NULL;
73 }
74 PyTuple_SET_ITEM(v, 0, unicode);
75
76 w = PyInt_FromLong(len);
77 if (w == NULL) {
78 Py_DECREF(v);
79 return NULL;
80 }
81 PyTuple_SET_ITEM(v, 1, w);
82
83 return v;
84}
85
86static PyObject *
87get_errorcallback(const char *errors)
88{
89 if (errors == NULL || strcmp(errors, "strict") == 0)
90 return ERROR_STRICT;
91 else if (strcmp(errors, "ignore") == 0)
92 return ERROR_IGNORE;
93 else if (strcmp(errors, "replace") == 0)
94 return ERROR_REPLACE;
95 else {
96 return PyCodec_LookupError(errors);
97 }
98}
99
100static int
101expand_encodebuffer(MultibyteEncodeBuffer *buf, int esize)
102{
103 int orgpos, orgsize;
104
105 orgpos = (int)((char*)buf->outbuf - PyString_AS_STRING(buf->outobj));
106 orgsize = PyString_GET_SIZE(buf->outobj);
107 if (_PyString_Resize(&buf->outobj, orgsize + (
108 esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1)
109 return -1;
110
111 buf->outbuf = (unsigned char *)PyString_AS_STRING(buf->outobj) + orgpos;
112 buf->outbuf_end = (unsigned char *)PyString_AS_STRING(buf->outobj)
113 + PyString_GET_SIZE(buf->outobj);
114
115 return 0;
116}
117#define RESERVE_ENCODEBUFFER(buf, s) { \
118 if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \
119 if (expand_encodebuffer(buf, s) == -1) \
120 goto errorexit; \
121}
122
123static int
124expand_decodebuffer(MultibyteDecodeBuffer *buf, int esize)
125{
126 int orgpos, orgsize;
127
128 orgpos = (int)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj));
129 orgsize = PyUnicode_GET_SIZE(buf->outobj);
130 if (PyUnicode_Resize(&buf->outobj, orgsize + (
131 esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1)
132 return -1;
133
134 buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj) + orgpos;
135 buf->outbuf_end = PyUnicode_AS_UNICODE(buf->outobj)
136 + PyUnicode_GET_SIZE(buf->outobj);
137
138 return 0;
139}
140#define RESERVE_DECODEBUFFER(buf, s) { \
141 if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \
142 if (expand_decodebuffer(buf, s) == -1) \
143 goto errorexit; \
144}
145
146static int
147multibytecodec_encerror(MultibyteCodec *codec,
148 MultibyteCodec_State *state,
149 MultibyteEncodeBuffer *buf,
150 PyObject *errors, int e)
151{
152 PyObject *retobj = NULL, *retstr = NULL, *argsobj, *tobj;
153 int retstrsize, newpos;
154 const char *reason;
155 size_t esize;
156 int start, end;
157
158 if (e > 0) {
159 reason = "illegal multibyte sequence";
160 esize = e;
161 } else {
162 switch (e) {
163 case MBERR_TOOSMALL:
164 RESERVE_ENCODEBUFFER(buf, -1);
165 return 0; /* retry it */
166 case MBERR_TOOFEW:
167 reason = "incomplete multibyte sequence";
168 esize = (size_t)(buf->inbuf_end - buf->inbuf);
169 break;
170 case MBERR_INTERNAL:
171 PyErr_SetString(PyExc_RuntimeError, "internal codec error");
172 return -1;
173 default:
174 PyErr_SetString(PyExc_RuntimeError, "unknown runtime error");
175 return -1;
176 }
177 }
178
179 if (errors == ERROR_REPLACE) {
180 const Py_UNICODE replchar = '?', *inbuf = &replchar;
181 int r;
182
183 for (;;) {
184 size_t outleft;
185
186 outleft = (size_t)(buf->outbuf_end - buf->outbuf);
187 r = codec->encode(state, &inbuf, 1, &buf->outbuf, outleft, 0);
188 if (r == MBERR_TOOSMALL) {
189 RESERVE_ENCODEBUFFER(buf, -1);
190 continue;
191 } else
192 break;
193 }
194
195 if (r != 0) {
196 RESERVE_ENCODEBUFFER(buf, 1);
197 *buf->outbuf++ = '?';
198 }
199 }
200 if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) {
201 buf->inbuf += esize;
202 return 0;
203 }
204
205 start = (int)(buf->inbuf - buf->inbuf_top);
206 end = start + esize;
207
208 /* use cached exception object if available */
209 if (buf->excobj == NULL) {
210 buf->excobj = PyUnicodeEncodeError_Create(codec->encoding,
211 buf->inbuf_top, (int)(buf->inbuf_end - buf->inbuf_top),
212 start, end, reason);
213 if (buf->excobj == NULL)
214 goto errorexit;
215 } else
216 if (PyUnicodeEncodeError_SetStart(buf->excobj, start) != 0 ||
217 PyUnicodeEncodeError_SetEnd(buf->excobj, end) != 0 ||
218 PyUnicodeEncodeError_SetReason(buf->excobj, reason) != 0)
219 goto errorexit;
220
221 if (errors == ERROR_STRICT) {
222 PyCodec_StrictErrors(buf->excobj);
223 goto errorexit;
224 }
225
226 argsobj = PyTuple_New(1);
227 if (argsobj == NULL)
228 goto errorexit;
229
230 PyTuple_SET_ITEM(argsobj, 0, buf->excobj);
231 Py_INCREF(buf->excobj);
232 retobj = PyObject_CallObject(errors, argsobj);
233 Py_DECREF(argsobj);
234 if (retobj == NULL)
235 goto errorexit;
236
237 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
238 !PyUnicode_Check((tobj = PyTuple_GET_ITEM(retobj, 0))) ||
239 !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
240 PyErr_SetString(PyExc_ValueError,
241 "encoding error handler must return (unicode, int) tuple");
242 goto errorexit;
243 }
244
245 {
246 const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj);
247
248 retstr = multibytecodec_encode(codec, state, &uraw,
249 PyUnicode_GET_SIZE(tobj), ERROR_STRICT, MBENC_FLUSH);
250 if (retstr == NULL)
251 goto errorexit;
252 }
253
254 retstrsize = PyString_GET_SIZE(retstr);
255 RESERVE_ENCODEBUFFER(buf, retstrsize);
256
257 memcpy(buf->outbuf, PyString_AS_STRING(retstr), retstrsize);
258 buf->outbuf += retstrsize;
259
260 newpos = (int)PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
261 if (newpos < 0)
262 newpos += (int)(buf->inbuf_end - buf->inbuf_top);
263 if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) {
264 PyErr_Format(PyExc_IndexError,
265 "position %d from error handler out of bounds", newpos);
266 goto errorexit;
267 }
268 buf->inbuf = buf->inbuf_top + newpos;
269
270 Py_DECREF(retobj);
271 Py_DECREF(retstr);
272 return 0;
273
274errorexit:
275 Py_XDECREF(retobj);
276 Py_XDECREF(retstr);
277 return -1;
278}
279
280static int
281multibytecodec_decerror(MultibyteCodec *codec,
282 MultibyteCodec_State *state,
283 MultibyteDecodeBuffer *buf,
284 PyObject *errors, int e)
285{
286 PyObject *argsobj, *retobj = NULL, *retuni = NULL;
287 int retunisize, newpos;
288 const char *reason;
289 size_t esize;
290 int start, end;
291
292 if (e > 0) {
293 reason = "illegal multibyte sequence";
294 esize = e;
295 } else {
296 switch (e) {
297 case MBERR_TOOSMALL:
298 RESERVE_DECODEBUFFER(buf, -1);
299 return 0; /* retry it */
300 case MBERR_TOOFEW:
301 reason = "incomplete multibyte sequence";
302 esize = (size_t)(buf->inbuf_end - buf->inbuf);
303 break;
304 case MBERR_INTERNAL:
305 PyErr_SetString(PyExc_RuntimeError, "internal codec error");
306 return -1;
307 default:
308 PyErr_SetString(PyExc_RuntimeError, "unknown runtime error");
309 return -1;
310 }
311 }
312
313 if (errors == ERROR_REPLACE) {
314 RESERVE_DECODEBUFFER(buf, 1);
315 *buf->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER;
316 }
317 if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) {
318 buf->inbuf += esize;
319 return 0;
320 }
321
322 start = (int)(buf->inbuf - buf->inbuf_top);
323 end = start + esize;
324
325 /* use cached exception object if available */
326 if (buf->excobj == NULL) {
327 buf->excobj = PyUnicodeDecodeError_Create(codec->encoding,
328 buf->inbuf_top, (int)(buf->inbuf_end - buf->inbuf_top),
329 start, end, reason);
330 if (buf->excobj == NULL)
331 goto errorexit;
332 } else
333 if (PyUnicodeDecodeError_SetStart(buf->excobj, start) ||
334 PyUnicodeDecodeError_SetEnd(buf->excobj, end) ||
335 PyUnicodeDecodeError_SetReason(buf->excobj, reason))
336 goto errorexit;
337
338 if (errors == ERROR_STRICT) {
339 PyCodec_StrictErrors(buf->excobj);
340 goto errorexit;
341 }
342
343 argsobj = PyTuple_New(1);
344 if (argsobj == NULL)
345 goto errorexit;
346
347 PyTuple_SET_ITEM(argsobj, 0, buf->excobj);
348 Py_INCREF(buf->excobj);
349 retobj = PyObject_CallObject(errors, argsobj);
350 Py_DECREF(argsobj);
351 if (retobj == NULL)
352 goto errorexit;
353
354 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
355 !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
356 !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
357 PyErr_SetString(PyExc_ValueError,
358 "decoding error handler must return (unicode, int) tuple");
359 goto errorexit;
360 }
361
362 retunisize = PyUnicode_GET_SIZE(retuni);
363 if (retunisize > 0) {
364 RESERVE_DECODEBUFFER(buf, retunisize);
365 memcpy((char *)buf->outbuf, PyUnicode_AS_DATA(retuni),
366 retunisize * Py_UNICODE_SIZE);
367 buf->outbuf += retunisize;
368 }
369
370 newpos = (int)PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
371 if (newpos < 0)
372 newpos += (int)(buf->inbuf_end - buf->inbuf_top);
373 if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) {
374 PyErr_Format(PyExc_IndexError,
375 "position %d from error handler out of bounds", newpos);
376 goto errorexit;
377 }
378 buf->inbuf = buf->inbuf_top + newpos;
379 Py_DECREF(retobj);
380 return 0;
381
382errorexit:
383 Py_XDECREF(retobj);
384 return -1;
385}
386
387static PyObject *
388multibytecodec_encode(MultibyteCodec *codec,
389 MultibyteCodec_State *state,
390 const Py_UNICODE **data, size_t datalen,
391 PyObject *errors, int flags)
392{
393 MultibyteEncodeBuffer buf;
394 int finalsize, r = 0;
395
396 if (datalen == 0)
397 return PyString_FromString("");
398
399 buf.excobj = NULL;
400 buf.inbuf = buf.inbuf_top = *data;
401 buf.inbuf_end = buf.inbuf_top + datalen;
402 buf.outobj = PyString_FromStringAndSize(NULL, datalen * 2 + 16);
403 if (buf.outobj == NULL)
404 goto errorexit;
405 buf.outbuf = (unsigned char *)PyString_AS_STRING(buf.outobj);
406 buf.outbuf_end = buf.outbuf + PyString_GET_SIZE(buf.outobj);
407
408 while (buf.inbuf < buf.inbuf_end) {
409 size_t inleft, outleft;
410
411 /* we don't reuse inleft and outleft here.
412 * error callbacks can relocate the cursor anywhere on buffer */
413 inleft = (size_t)(buf.inbuf_end - buf.inbuf);
414 outleft = (size_t)(buf.outbuf_end - buf.outbuf);
415 r = codec->encode(state, &buf.inbuf, inleft,
416 &buf.outbuf, outleft, flags);
417 *data = buf.inbuf;
418 if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH)))
419 break;
420 else if (multibytecodec_encerror(codec, state, &buf, errors, r))
421 goto errorexit;
422 else if (r == MBERR_TOOFEW)
423 break;
424 }
425
426 if (codec->encreset != NULL)
427 for (;;) {
428 size_t outleft;
429
430 outleft = (size_t)(buf.outbuf_end - buf.outbuf);
431 r = codec->encreset(state, &buf.outbuf, outleft);
432 if (r == 0)
433 break;
434 else if (multibytecodec_encerror(codec, state, &buf, errors, r))
435 goto errorexit;
436 }
437
438 finalsize = (int)((char*)buf.outbuf - PyString_AS_STRING(buf.outobj));
439
440 if (finalsize != PyString_GET_SIZE(buf.outobj))
441 if (_PyString_Resize(&buf.outobj, finalsize) == -1)
442 goto errorexit;
443
444 Py_XDECREF(buf.excobj);
445 return buf.outobj;
446
447errorexit:
448 Py_XDECREF(buf.excobj);
449 Py_XDECREF(buf.outobj);
450 return NULL;
451}
452
453static PyObject *
454MultibyteCodec_Encode(MultibyteCodecObject *self,
455 PyObject *args, PyObject *kwargs)
456{
457 MultibyteCodec_State state;
458 Py_UNICODE *data;
459 PyObject *errorcb, *r, *arg;
460 const char *errors = NULL;
461 int datalen;
462
463 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|z:encode",
464 codeckwarglist, &arg, &errors))
465 return NULL;
466
467 if (PyUnicode_Check(arg)) {
468 data = PyUnicode_AS_UNICODE(arg);
469 datalen = PyUnicode_GET_SIZE(arg);
470 arg = NULL; /* forget reference */
471 } else {
472 arg = PyObject_Unicode(arg);
473 if (arg == NULL || !PyUnicode_Check(arg)) {
474 Py_XDECREF(arg);
475 return NULL;
476 }
477 data = PyUnicode_AS_UNICODE(arg);
478 datalen = PyUnicode_GET_SIZE(arg);
479 }
480
481 errorcb = get_errorcallback(errors);
482 if (errorcb == NULL) {
483 Py_XDECREF(arg);
484 return NULL;
485 }
486
487 if (self->codec->encinit != NULL && self->codec->encinit(&state) != 0)
488 goto errorexit;
489 r = multibytecodec_encode(self->codec, &state, (const Py_UNICODE **)&data,
490 datalen, errorcb, MBENC_FLUSH | MBENC_RESET);
491 if (r == NULL)
492 goto errorexit;
493
494 if (errorcb > ERROR_MAX) {
495 Py_DECREF(errorcb);
496 }
497 Py_XDECREF(arg);
498 return make_tuple(r, datalen);
499
500errorexit:
501 if (errorcb > ERROR_MAX) {
502 Py_DECREF(errorcb);
503 }
504 Py_XDECREF(arg);
505 return NULL;
506}
507
508static PyObject *
509MultibyteCodec_Decode(MultibyteCodecObject *self,
510 PyObject *args, PyObject *kwargs)
511{
512 MultibyteCodec_State state;
513 MultibyteDecodeBuffer buf;
514 PyObject *errorcb;
515 const char *data, *errors = NULL;
516 int datalen, finalsize;
517
518 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|z:decode",
519 codeckwarglist, &data, &datalen, &errors))
520 return NULL;
521
522 errorcb = get_errorcallback(errors);
523 if (errorcb == NULL)
524 return NULL;
525
526 if (datalen == 0) {
527 if (errorcb > ERROR_MAX)
528 {Py_DECREF(errorcb);}
529 return make_tuple(PyUnicode_FromUnicode(NULL, 0), 0);
530 }
531
532 buf.outobj = buf.excobj = NULL;
533 buf.inbuf = buf.inbuf_top = (unsigned char *)data;
534 buf.inbuf_end = buf.inbuf_top + datalen;
535 buf.outobj = PyUnicode_FromUnicode(NULL, datalen);
536 if (buf.outobj == NULL)
537 goto errorexit;
538 buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj);
539 buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj);
540
541 if (self->codec->decinit != NULL && self->codec->decinit(&state) != 0)
542 goto errorexit;
543
544 while (buf.inbuf < buf.inbuf_end) {
545 size_t inleft, outleft;
546 int r;
547
548 inleft = (size_t)(buf.inbuf_end - buf.inbuf);
549 outleft = (size_t)(buf.outbuf_end - buf.outbuf);
550
551 r = self->codec->decode(&state, &buf.inbuf, inleft,
552 &buf.outbuf, outleft);
553 if (r == 0)
554 break;
555 else if (multibytecodec_decerror(self->codec, &state, &buf, errorcb, r))
556 goto errorexit;
557 }
558
559 finalsize = (int)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj));
560
561 if (finalsize != PyUnicode_GET_SIZE(buf.outobj))
562 if (PyUnicode_Resize(&buf.outobj, finalsize) == -1)
563 goto errorexit;
564
565 Py_XDECREF(buf.excobj);
566 if (errorcb > ERROR_MAX)
567 {Py_DECREF(errorcb);}
568 return make_tuple(buf.outobj, datalen);
569
570errorexit:
571 if (errorcb > ERROR_MAX)
572 {Py_DECREF(errorcb);}
573 Py_XDECREF(buf.excobj);
574 Py_XDECREF(buf.outobj);
575
576 return NULL;
577}
578
579static PyObject *
580MultibyteCodec_StreamReader(MultibyteCodecObject *self,
581 PyObject *args, PyObject *kwargs)
582{
583 PyObject *stream;
584 char *errors = NULL;
585
586 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s:StreamReader",
587 streamkwarglist, &stream, &errors))
588 return NULL;
589
590 return mbstreamreader_create(self->codec, stream, errors);
591}
592
593static PyObject *
594MultibyteCodec_StreamWriter(MultibyteCodecObject *self,
595 PyObject *args, PyObject *kwargs)
596{
597 PyObject *stream;
598 char *errors = NULL;
599
600 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|s:StreamWriter",
601 streamkwarglist, &stream, &errors))
602 return NULL;
603
604 return mbstreamwriter_create(self->codec, stream, errors);
605}
606
607static struct PyMethodDef multibytecodec_methods[] = {
608 {"encode", (PyCFunction)MultibyteCodec_Encode,
609 METH_VARARGS | METH_KEYWORDS,
610 MultibyteCodec_Encode__doc__},
611 {"decode", (PyCFunction)MultibyteCodec_Decode,
612 METH_VARARGS | METH_KEYWORDS,
613 MultibyteCodec_Decode__doc__},
614 {"StreamReader",(PyCFunction)MultibyteCodec_StreamReader,
615 METH_VARARGS | METH_KEYWORDS,
616 MultibyteCodec_StreamReader__doc__},
617 {"StreamWriter",(PyCFunction)MultibyteCodec_StreamWriter,
618 METH_VARARGS | METH_KEYWORDS,
619 MultibyteCodec_StreamWriter__doc__},
620 {NULL, NULL},
621};
622
623static void
624multibytecodec_dealloc(MultibyteCodecObject *self)
625{
626 PyObject_Del(self);
627}
628
629
630
/*
 * Type object of the codec wrapper.  Only tp_dealloc, tp_getattro, tp_flags
 * and tp_methods are filled in; every other slot shown here is 0.
 */
static PyTypeObject MultibyteCodec_Type = {
    PyObject_HEAD_INIT(NULL)
    0, /* ob_size */
    "MultibyteCodec", /* tp_name */
    sizeof(MultibyteCodecObject), /* tp_basicsize */
    0, /* tp_itemsize */
    /* methods */
    (destructor)multibytecodec_dealloc, /* tp_dealloc */
    0, /* tp_print */
    0, /* tp_getattr */
    0, /* tp_setattr */
    0, /* tp_compare */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    PyObject_GenericGetAttr, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
    0, /* tp_doc */
    0, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    0, /* tp_weaklistoffset */
    0, /* tp_iter */
    0, /* tp_iternext */
    multibytecodec_methods, /* tp_methods */
};
663
/*
 * Common implementation of the stream reader's read methods.
 *
 * Calls `method` on the wrapped stream (with `sizehint` as argument when it
 * is non-negative, without arguments otherwise), prepends any bytes left
 * pending by an earlier call, and decodes the result with the reader's
 * codec and state.  Bytes forming an incomplete trailing sequence are kept
 * in self->pending for the next call.  When a sized read decodes to nothing
 * although bytes were returned, one more byte at a time is read until some
 * output exists or the stream returns an empty string.
 *
 * Returns a new unicode object, or NULL with an exception set.
 */
static PyObject *
mbstreamreader_iread(MultibyteStreamReaderObject *self,
                     const char *method, int sizehint)
{
    MultibyteDecodeBuffer buf;
    PyObject *cres;
    int rsize, r, finalsize = 0;

    /* A request for zero bytes never touches the stream. */
    if (sizehint == 0)
        return PyUnicode_FromUnicode(NULL, 0);

    buf.outobj = buf.excobj = NULL;
    cres = NULL;

    for (;;) {
        /* Fetch the next chunk of raw bytes from the stream. */
        if (sizehint < 0)
            cres = PyObject_CallMethod(self->stream, (char *)method, NULL);
        else
            cres = PyObject_CallMethod(self->stream,
                                       (char *)method, "i", sizehint);
        if (cres == NULL)
            goto errorexit;

        if (!PyString_Check(cres)) {
            PyErr_SetString(PyExc_TypeError,
                            "stream function returned a non-string object");
            goto errorexit;
        }

        /* Glue the bytes left over from the previous call in front of the
         * new chunk and clear the pending buffer. */
        if (self->pendingsize > 0) {
            PyObject *ctr;
            char *ctrdata;

            rsize = PyString_GET_SIZE(cres) + self->pendingsize;
            ctr = PyString_FromStringAndSize(NULL, rsize);
            if (ctr == NULL)
                goto errorexit;
            ctrdata = PyString_AS_STRING(ctr);
            memcpy(ctrdata, self->pending, self->pendingsize);
            memcpy(ctrdata + self->pendingsize,
                   PyString_AS_STRING(cres), PyString_GET_SIZE(cres));
            Py_DECREF(cres);
            cres = ctr;
            self->pendingsize = 0;
        }

        rsize = PyString_GET_SIZE(cres);
        buf.inbuf = buf.inbuf_top = (unsigned char *)PyString_AS_STRING(cres);
        buf.inbuf_end = buf.inbuf_top + rsize;
        /* The output object is created once and reused by later rounds. */
        if (buf.outobj == NULL) {
            buf.outobj = PyUnicode_FromUnicode(NULL, rsize);
            if (buf.outobj == NULL)
                goto errorexit;
            buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj);
            buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj);
        }

        r = 0;
        if (rsize > 0)
            while (buf.inbuf < buf.inbuf_end) {
                size_t inleft, outleft;

                inleft = (size_t)(buf.inbuf_end - buf.inbuf);
                outleft = (size_t)(buf.outbuf_end - buf.outbuf);

                r = self->codec->decode(&self->state, &buf.inbuf, inleft,
                                        &buf.outbuf, outleft);
                /* An incomplete tail is not an error yet: more bytes may
                 * still arrive. */
                if (r == 0 || r == MBERR_TOOFEW)
                    break;
                else if (multibytecodec_decerror(self->codec,
                                &self->state, &buf, self->errors, r))
                    goto errorexit;
            }

        if (rsize == 0 || sizehint < 0) { /* end of file */
            /* No more data will come: an incomplete tail is now reported
             * through the error handler. */
            if (buf.inbuf < buf.inbuf_end &&
                multibytecodec_decerror(self->codec, &self->state, &buf,
                                        self->errors, MBERR_TOOFEW))
                goto errorexit;
        }

        if (buf.inbuf < buf.inbuf_end) { /* pending sequence exists */
            size_t npendings;

            /* we can't assume that pendingsize is still 0 here. because
             * this function can be called recursively from error callback */
            npendings = (size_t)(buf.inbuf_end - buf.inbuf);
            if (npendings + self->pendingsize > MAXDECPENDING) {
                PyErr_SetString(PyExc_RuntimeError,
                                "pending buffer overflow");
                goto errorexit;
            }
            memcpy(self->pending + self->pendingsize, buf.inbuf, npendings);
            self->pendingsize += npendings;
        }

        finalsize = (int)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj));

        Py_DECREF(cres);
        cres = NULL;

        /* Stop after an unsized read, once something was decoded, or when
         * the stream returned no data. */
        if (sizehint < 0 || finalsize != 0 || rsize == 0)
            break;

        sizehint = 1; /* read 1 more byte and retry */
    }

    /* Trim the output to the number of characters actually written. */
    if (finalsize != PyUnicode_GET_SIZE(buf.outobj))
        if (PyUnicode_Resize(&buf.outobj, finalsize) == -1)
            goto errorexit;

    Py_XDECREF(cres);
    Py_XDECREF(buf.excobj);
    return buf.outobj;

errorexit:
    Py_XDECREF(cres);
    Py_XDECREF(buf.excobj);
    Py_XDECREF(buf.outobj);
    return NULL;
}
785
786static PyObject *
787mbstreamreader_read(MultibyteStreamReaderObject *self, PyObject *args)
788{
789 PyObject *sizeobj = NULL;
790 long size;
791
792 if (!PyArg_ParseTuple(args, "|O:read", &sizeobj))
793 return NULL;
794
795 if (sizeobj == Py_None || sizeobj == NULL)
796 size = -1;
797 else if (PyInt_Check(sizeobj))
798 size = PyInt_AsLong(sizeobj);
799 else {
800 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer");
801 return NULL;
802 }
803
804 return mbstreamreader_iread(self, "read", size);
805}
806
807static PyObject *
808mbstreamreader_readline(MultibyteStreamReaderObject *self, PyObject *args)
809{
810 PyObject *sizeobj = NULL;
811 long size;
812
813 if (!PyArg_ParseTuple(args, "|O:readline", &sizeobj))
814 return NULL;
815
816 if (sizeobj == Py_None || sizeobj == NULL)
817 size = -1;
818 else if (PyInt_Check(sizeobj))
819 size = PyInt_AsLong(sizeobj);
820 else {
821 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer");
822 return NULL;
823 }
824
825 return mbstreamreader_iread(self, "readline", size);
826}
827
828static PyObject *
829mbstreamreader_readlines(MultibyteStreamReaderObject *self, PyObject *args)
830{
831 PyObject *sizehintobj = NULL, *r, *sr;
832 long sizehint;
833
834 if (!PyArg_ParseTuple(args, "|O:readlines", &sizehintobj))
835 return NULL;
836
837 if (sizehintobj == Py_None || sizehintobj == NULL)
838 sizehint = -1;
839 else if (PyInt_Check(sizehintobj))
840 sizehint = PyInt_AsLong(sizehintobj);
841 else {
842 PyErr_SetString(PyExc_TypeError, "arg 1 must be an integer");
843 return NULL;
844 }
845
846 r = mbstreamreader_iread(self, "read", sizehint);
847 if (r == NULL)
848 return NULL;
849
850 sr = PyUnicode_Splitlines(r, 1);
851 Py_DECREF(r);
852 return sr;
853}
854
855static PyObject *
856mbstreamreader_reset(MultibyteStreamReaderObject *self)
857{
858 if (self->codec->decreset != NULL &&
859 self->codec->decreset(&self->state) != 0)
860 return NULL;
861 self->pendingsize = 0;
862
863 Py_INCREF(Py_None);
864 return Py_None;
865}
866
867static struct PyMethodDef mbstreamreader_methods[] = {
868 {"read", (PyCFunction)mbstreamreader_read,
869 METH_VARARGS, NULL},
870 {"readline", (PyCFunction)mbstreamreader_readline,
871 METH_VARARGS, NULL},
872 {"readlines", (PyCFunction)mbstreamreader_readlines,
873 METH_VARARGS, NULL},
874 {"reset", (PyCFunction)mbstreamreader_reset,
875 METH_NOARGS, NULL},
876 {NULL, NULL},
877};
878
879static void
880mbstreamreader_dealloc(MultibyteStreamReaderObject *self)
881{
882 if (self->errors > ERROR_MAX) {
883 Py_DECREF(self->errors);
884 }
885 Py_DECREF(self->stream);
886 PyObject_Del(self);
887}
888
889
890
/*
 * Type object of the stream reader.  Only tp_dealloc, tp_getattro, tp_flags
 * and tp_methods are filled in; every other slot shown here is 0.
 */
static PyTypeObject MultibyteStreamReader_Type = {
    PyObject_HEAD_INIT(NULL)
    0, /* ob_size */
    "MultibyteStreamReader", /* tp_name */
    sizeof(MultibyteStreamReaderObject), /* tp_basicsize */
    0, /* tp_itemsize */
    /* methods */
    (destructor)mbstreamreader_dealloc, /* tp_dealloc */
    0, /* tp_print */
    0, /* tp_getattr */
    0, /* tp_setattr */
    0, /* tp_compare */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    PyObject_GenericGetAttr, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
    0, /* tp_doc */
    0, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    0, /* tp_weaklistoffset */
    0, /* tp_iter */
    0, /* tp_iternext */
    mbstreamreader_methods, /* tp_methods */
};
923
924static int
925mbstreamwriter_iwrite(MultibyteStreamWriterObject *self,
926 PyObject *unistr)
927{
928 PyObject *wr, *r = NULL;
929 Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL;
930 int rsize;
931
932 if (!PyUnicode_Check(unistr)) {
933 PyErr_SetString(PyExc_TypeError,
934 "only unicode objects are encodable.");
935 return -1;
936 }
937
938 rsize = PyUnicode_GET_SIZE(unistr);
939 if (rsize == 0)
940 return 0;
941
942 if (self->pendingsize > 0) {
943 inbuf_tmp = PyMem_New(Py_UNICODE, rsize + self->pendingsize);
944 if (inbuf_tmp == NULL)
945 goto errorexit;
946 memcpy(inbuf_tmp, self->pending, Py_UNICODE_SIZE * self->pendingsize);
947 memcpy(inbuf_tmp + self->pendingsize, PyUnicode_AS_UNICODE(unistr),
948 Py_UNICODE_SIZE * rsize);
949 rsize += self->pendingsize;
950 self->pendingsize = 0;
951 inbuf = inbuf_tmp;
952 } else
953 inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr);
954
955 inbuf_end = inbuf + rsize;
956
957 r = multibytecodec_encode(self->codec, &self->state,
958 (const Py_UNICODE **)&inbuf, rsize, self->errors, 0);
959 if (r == NULL)
960 goto errorexit;
961
962 if (inbuf < inbuf_end) {
963 self->pendingsize = (int)(inbuf_end - inbuf);
964 if (self->pendingsize > MAXENCPENDING) {
965 self->pendingsize = 0;
966 PyErr_SetString(PyExc_RuntimeError, "pending buffer overflow");
967 goto errorexit;
968 }
969 memcpy(self->pending, inbuf, self->pendingsize * Py_UNICODE_SIZE);
970 }
971
972 wr = PyObject_CallMethod(self->stream, "write", "O", r);
973 if (wr == NULL)
974 goto errorexit;
975
976 if (inbuf_tmp != NULL)
977 PyMem_Del(inbuf_tmp);
978 Py_DECREF(r);
979 Py_DECREF(wr);
980 return 0;
981
982errorexit:
983 if (inbuf_tmp != NULL)
984 PyMem_Del(inbuf_tmp);
985 Py_XDECREF(r);
986 return -1;
987}
988
989static PyObject *
990mbstreamwriter_write(MultibyteStreamWriterObject *self, PyObject *args)
991{
992 PyObject *strobj;
993
994 if (!PyArg_ParseTuple(args, "O:write", &strobj))
995 return NULL;
996
997 if (mbstreamwriter_iwrite(self, strobj))
998 return NULL;
999 else {
1000 Py_INCREF(Py_None);
1001 return Py_None;
1002 }
1003}
1004
1005static PyObject *
1006mbstreamwriter_writelines(MultibyteStreamWriterObject *self, PyObject *args)
1007{
1008 PyObject *lines, *strobj;
1009 int i, r;
1010
1011 if (!PyArg_ParseTuple(args, "O:writelines", &lines))
1012 return NULL;
1013
1014 if (!PySequence_Check(lines)) {
1015 PyErr_SetString(PyExc_TypeError, "arg must be a sequence object");
1016 return NULL;
1017 }
1018
1019 for (i = 0; i < PySequence_Length(lines); i++) {
1020 /* length can be changed even within this loop */
1021 strobj = PySequence_GetItem(lines, i);
1022 if (strobj == NULL)
1023 return NULL;
1024
1025 r = mbstreamwriter_iwrite(self, strobj);
1026 Py_DECREF(strobj);
1027 if (r == -1)
1028 return NULL;
1029 }
1030
1031 Py_INCREF(Py_None);
1032 return Py_None;
1033}
1034
1035static PyObject *
1036mbstreamwriter_reset(MultibyteStreamWriterObject *self)
1037{
1038 const Py_UNICODE *pending;
1039 PyObject *pwrt;
1040
1041 pending = self->pending;
1042 pwrt = multibytecodec_encode(self->codec, &self->state,
1043 &pending, self->pendingsize, self->errors,
1044 MBENC_FLUSH | MBENC_RESET);
1045 /* some pending buffer can be truncated when UnicodeEncodeError is
1046 * raised on 'strict' mode. but, 'reset' method is designed to
1047 * reset the pending buffer or states so failed string sequence
1048 * ought to be missed */
1049 self->pendingsize = 0;
1050 if (pwrt == NULL)
1051 return NULL;
1052
1053 if (PyString_Size(pwrt) > 0) {
1054 PyObject *wr;
1055 wr = PyObject_CallMethod(self->stream, "write", "O", pwrt);
1056 if (wr == NULL) {
1057 Py_DECREF(pwrt);
1058 return NULL;
1059 }
1060 }
1061 Py_DECREF(pwrt);
1062
1063 Py_INCREF(Py_None);
1064 return Py_None;
1065}
1066
1067static void
1068mbstreamwriter_dealloc(MultibyteStreamWriterObject *self)
1069{
1070 if (self->errors > ERROR_MAX) {
1071 Py_DECREF(self->errors);
1072 }
1073 Py_DECREF(self->stream);
1074 PyObject_Del(self);
1075}
1076
1077static struct PyMethodDef mbstreamwriter_methods[] = {
1078 {"write", (PyCFunction)mbstreamwriter_write,
1079 METH_VARARGS, NULL},
1080 {"writelines", (PyCFunction)mbstreamwriter_writelines,
1081 METH_VARARGS, NULL},
1082 {"reset", (PyCFunction)mbstreamwriter_reset,
1083 METH_NOARGS, NULL},
1084 {NULL, NULL},
1085};
1086
1087
1088
/*
 * Type object of the stream writer.  Only tp_dealloc, tp_getattro, tp_flags
 * and tp_methods are filled in; every other slot shown here is 0.
 */
static PyTypeObject MultibyteStreamWriter_Type = {
    PyObject_HEAD_INIT(NULL)
    0, /* ob_size */
    "MultibyteStreamWriter", /* tp_name */
    sizeof(MultibyteStreamWriterObject), /* tp_basicsize */
    0, /* tp_itemsize */
    /* methods */
    (destructor)mbstreamwriter_dealloc, /* tp_dealloc */
    0, /* tp_print */
    0, /* tp_getattr */
    0, /* tp_setattr */
    0, /* tp_compare */
    0, /* tp_repr */
    0, /* tp_as_number */
    0, /* tp_as_sequence */
    0, /* tp_as_mapping */
    0, /* tp_hash */
    0, /* tp_call */
    0, /* tp_str */
    PyObject_GenericGetAttr, /* tp_getattro */
    0, /* tp_setattro */
    0, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
    0, /* tp_doc */
    0, /* tp_traverse */
    0, /* tp_clear */
    0, /* tp_richcompare */
    0, /* tp_weaklistoffset */
    0, /* tp_iter */
    0, /* tp_iternext */
    mbstreamwriter_methods, /* tp_methods */
};
1121
1122static PyObject *
1123__create_codec(PyObject *ignore, PyObject *arg)
1124{
1125 MultibyteCodecObject *self;
1126
1127
1128 if (!PyCObject_Check(arg)) {
1129 PyErr_SetString(PyExc_ValueError, "argument type invalid");
1130 return NULL;
1131 }
1132
1133 self = PyObject_New(MultibyteCodecObject, &MultibyteCodec_Type);
1134 if (self == NULL)
1135 return NULL;
1136
1137 self->codec = PyCObject_AsVoidPtr(arg);
1138
1139 return (PyObject *)self;
1140}
1141
1142static PyObject *
1143mbstreamreader_create(MultibyteCodec *codec,
1144 PyObject *stream, const char *errors)
1145{
1146 MultibyteStreamReaderObject *self;
1147
1148 self = PyObject_New(MultibyteStreamReaderObject,
1149 &MultibyteStreamReader_Type);
1150 if (self == NULL)
1151 return NULL;
1152
1153 self->codec = codec;
1154 self->stream = stream;
1155 Py_INCREF(stream);
1156 self->pendingsize = 0;
1157 self->errors = get_errorcallback(errors);
1158 if (self->errors == NULL)
1159 goto errorexit;
1160 if (self->codec->decinit != NULL && self->codec->decinit(&self->state) != 0)
1161 goto errorexit;
1162
1163 return (PyObject *)self;
1164
1165errorexit:
1166 Py_XDECREF(self);
1167 return NULL;
1168}
1169
1170static PyObject *
1171mbstreamwriter_create(MultibyteCodec *codec,
1172 PyObject *stream, const char *errors)
1173{
1174 MultibyteStreamWriterObject *self;
1175
1176 self = PyObject_New(MultibyteStreamWriterObject,
1177 &MultibyteStreamWriter_Type);
1178 if (self == NULL)
1179 return NULL;
1180
1181 self->codec = codec;
1182 self->stream = stream;
1183 Py_INCREF(stream);
1184 self->pendingsize = 0;
1185 self->errors = get_errorcallback(errors);
1186 if (self->errors == NULL)
1187 goto errorexit;
1188 if (self->codec->encinit != NULL && self->codec->encinit(&self->state) != 0)
1189 goto errorexit;
1190
1191 return (PyObject *)self;
1192
1193errorexit:
1194 Py_XDECREF(self);
1195 return NULL;
1196}
1197
1198static struct PyMethodDef __methods[] = {
1199 {"__create_codec", (PyCFunction)__create_codec, METH_O},
1200 {NULL, NULL},
1201};
1202
1203void
1204init_multibytecodec(void)
1205{
1206 Py_InitModule("_multibytecodec", __methods);
1207
1208 if (PyErr_Occurred())
1209 Py_FatalError("can't initialize the _multibytecodec module");
1210}