blob: 4f368f8b8fcc2609be7f2b7bbad2fe9be01d7c27 [file] [log] [blame]
/* ------------------------------------------------------------------------

   _codecs -- Provides access to the codec registry and the builtin
              codecs.

   This module should never be imported directly. The standard library
   module "codecs" wraps this builtin module for use within Python.

   The codec registry is accessible via:

     register(search_function) -> None

     lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)

   The builtin Unicode codecs use the following interface:

     <encoding>_encode(Unicode_object[,errors='strict']) ->
        (string object, bytes consumed)

     <encoding>_decode(char_buffer_obj[,errors='strict']) ->
        (Unicode object, bytes consumed)

   These <encoding>s are available: utf_8, utf_16, utf_16_le, utf_16_be,
   unicode_escape, raw_unicode_escape, unicode_internal, latin_1,
   ascii (7-bit), charmap (takes an additional mapping argument) and,
   on MS_WIN32 builds only, mbcs.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

   ------------------------------------------------------------------------ */
31
32#include "Python.h"
33
34/* --- Registry ----------------------------------------------------------- */
35
36static
37PyObject *codecregister(PyObject *self, PyObject *args)
38{
39 PyObject *search_function;
40
41 if (!PyArg_ParseTuple(args, "O:register", &search_function))
42 goto onError;
43
44 if (PyCodec_Register(search_function))
45 goto onError;
46
47 Py_INCREF(Py_None);
48 return Py_None;
49
50 onError:
51 return NULL;
52}
53
54static
55PyObject *codeclookup(PyObject *self, PyObject *args)
56{
57 char *encoding;
58
59 if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
60 goto onError;
61
62 return _PyCodec_Lookup(encoding);
63
64 onError:
65 return NULL;
66}
67
68/* --- Helpers ------------------------------------------------------------ */
69
70static
71PyObject *codec_tuple(PyObject *unicode,
72 int len)
73{
74 PyObject *v,*w;
75
76 if (unicode == NULL)
77 return NULL;
78 v = PyTuple_New(2);
79 if (v == NULL) {
80 Py_DECREF(unicode);
81 return NULL;
82 }
83 PyTuple_SET_ITEM(v,0,unicode);
84 w = PyInt_FromLong(len);
85 if (w == NULL) {
86 Py_DECREF(v);
87 return NULL;
88 }
89 PyTuple_SET_ITEM(v,1,w);
90 return v;
91}
92
93/* --- Decoder ------------------------------------------------------------ */
94
95static PyObject *
96unicode_internal_decode(PyObject *self,
97 PyObject *args)
98{
99 const char *data;
100 int size;
101 const char *errors = NULL;
102
103 if (!PyArg_ParseTuple(args, "s#|z:unicode_internal_decode",
104 &data, &size, &errors))
105 return NULL;
106
107 return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
108 size / sizeof(Py_UNICODE)),
109 size);
110}
111
112static PyObject *
113utf_8_decode(PyObject *self,
114 PyObject *args)
115{
116 const char *data;
117 int size;
118 const char *errors = NULL;
119
120 if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode",
121 &data, &size, &errors))
122 return NULL;
123
124 return codec_tuple(PyUnicode_DecodeUTF8(data, size, errors),
125 size);
126}
127
128static PyObject *
129utf_16_decode(PyObject *self,
130 PyObject *args)
131{
132 const char *data;
133 int size;
134 const char *errors = NULL;
135 int byteorder = 0;
136
137 if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode",
138 &data, &size, &errors))
139 return NULL;
140 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
141 size);
142}
143
144static PyObject *
145utf_16_le_decode(PyObject *self,
146 PyObject *args)
147{
148 const char *data;
149 int size;
150 const char *errors = NULL;
151 int byteorder = -1;
152
153 if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode",
154 &data, &size, &errors))
155 return NULL;
156 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
157 size);
158}
159
160static PyObject *
161utf_16_be_decode(PyObject *self,
162 PyObject *args)
163{
164 const char *data;
165 int size;
166 const char *errors = NULL;
167 int byteorder = 1;
168
169 if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode",
170 &data, &size, &errors))
171 return NULL;
172 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
173 size);
174}
175
176/* This non-standard version also provides access to the byteorder
177 parameter of the builtin UTF-16 codec.
178
179 It returns a tuple (unicode, bytesread, byteorder) with byteorder
180 being the value in effect at the end of data.
181
182*/
183
184static PyObject *
185utf_16_ex_decode(PyObject *self,
186 PyObject *args)
187{
188 const char *data;
189 int size;
190 const char *errors = NULL;
191 int byteorder = 0;
192 PyObject *unicode, *tuple;
193
194 if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode",
195 &data, &size, &errors, &byteorder))
196 return NULL;
197
198 unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder);
199 if (unicode == NULL)
200 return NULL;
201 tuple = Py_BuildValue("Oii", unicode, size, byteorder);
202 Py_DECREF(unicode);
203 return tuple;
204}
205
206static PyObject *
207unicode_escape_decode(PyObject *self,
208 PyObject *args)
209{
210 const char *data;
211 int size;
212 const char *errors = NULL;
213
214 if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode",
215 &data, &size, &errors))
216 return NULL;
217
218 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data, size, errors),
219 size);
220}
221
222static PyObject *
223raw_unicode_escape_decode(PyObject *self,
224 PyObject *args)
225{
226 const char *data;
227 int size;
228 const char *errors = NULL;
229
230 if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode",
231 &data, &size, &errors))
232 return NULL;
233
234 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data, size, errors),
235 size);
236}
237
238static PyObject *
239latin_1_decode(PyObject *self,
240 PyObject *args)
241{
242 const char *data;
243 int size;
244 const char *errors = NULL;
245
246 if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode",
247 &data, &size, &errors))
248 return NULL;
249
250 return codec_tuple(PyUnicode_DecodeLatin1(data, size, errors),
251 size);
252}
253
254static PyObject *
255ascii_decode(PyObject *self,
256 PyObject *args)
257{
258 const char *data;
259 int size;
260 const char *errors = NULL;
261
262 if (!PyArg_ParseTuple(args, "t#|z:ascii_decode",
263 &data, &size, &errors))
264 return NULL;
265
266 return codec_tuple(PyUnicode_DecodeASCII(data, size, errors),
267 size);
268}
269
270static PyObject *
271charmap_decode(PyObject *self,
272 PyObject *args)
273{
274 const char *data;
275 int size;
276 const char *errors = NULL;
277 PyObject *mapping = NULL;
278
279 if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode",
280 &data, &size, &errors, &mapping))
281 return NULL;
282 if (mapping == Py_None)
283 mapping = NULL;
284
285 return codec_tuple(PyUnicode_DecodeCharmap(data, size, mapping, errors),
286 size);
287}
288
Guido van Rossum24bdb042000-03-28 20:29:59 +0000289#ifdef MS_WIN32
290
291static PyObject *
292mbcs_decode(PyObject *self,
293 PyObject *args)
294{
295 const char *data;
296 int size;
297 const char *errors = NULL;
298
299 if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
300 &data, &size, &errors))
301 return NULL;
302
303 return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
304 size);
305}
306
307#endif /* MS_WIN32 */
308
Guido van Rossume2d67f92000-03-10 23:09:23 +0000309/* --- Encoder ------------------------------------------------------------ */
310
311static PyObject *
312readbuffer_encode(PyObject *self,
313 PyObject *args)
314{
315 const char *data;
316 int size;
317 const char *errors = NULL;
318
319 if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
320 &data, &size, &errors))
321 return NULL;
322
323 return codec_tuple(PyString_FromStringAndSize(data, size),
324 size);
325}
326
327static PyObject *
328charbuffer_encode(PyObject *self,
329 PyObject *args)
330{
331 const char *data;
332 int size;
333 const char *errors = NULL;
334
335 if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
336 &data, &size, &errors))
337 return NULL;
338
339 return codec_tuple(PyString_FromStringAndSize(data, size),
340 size);
341}
342
343static PyObject *
344utf_8_encode(PyObject *self,
345 PyObject *args)
346{
347 PyObject *str;
348 const char *errors = NULL;
349
350 if (!PyArg_ParseTuple(args, "U|z:utf_8_encode",
351 &str, &errors))
352 return NULL;
353
354 return codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
355 PyUnicode_GET_SIZE(str),
356 errors),
357 PyUnicode_GET_SIZE(str));
358}
359
360/* This version provides access to the byteorder parameter of the
361 builtin UTF-16 codecs as optional third argument. It defaults to 0
362 which means: use the native byte order and prepend the data with a
363 BOM mark.
364
365*/
366
367static PyObject *
368utf_16_encode(PyObject *self,
369 PyObject *args)
370{
371 PyObject *str;
372 const char *errors = NULL;
373 int byteorder = 0;
374
375 if (!PyArg_ParseTuple(args, "U|zi:utf_16_encode",
376 &str, &errors, &byteorder))
377 return NULL;
378
379 return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
380 PyUnicode_GET_SIZE(str),
381 errors,
382 byteorder),
383 PyUnicode_GET_SIZE(str));
384}
385
386static PyObject *
387utf_16_le_encode(PyObject *self,
388 PyObject *args)
389{
390 PyObject *str;
391 const char *errors = NULL;
392
393 if (!PyArg_ParseTuple(args, "U|zi:utf_16_le_encode",
394 &str, &errors))
395 return NULL;
396
397 return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
398 PyUnicode_GET_SIZE(str),
399 errors,
400 -1),
401 PyUnicode_GET_SIZE(str));
402}
403
404static PyObject *
405utf_16_be_encode(PyObject *self,
406 PyObject *args)
407{
408 PyObject *str;
409 const char *errors = NULL;
410
411 if (!PyArg_ParseTuple(args, "U|zi:utf_16_be_encode",
412 &str, &errors))
413 return NULL;
414
415 return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
416 PyUnicode_GET_SIZE(str),
417 errors,
418 +1),
419 PyUnicode_GET_SIZE(str));
420}
421
422static PyObject *
423unicode_escape_encode(PyObject *self,
424 PyObject *args)
425{
426 PyObject *str;
427 const char *errors = NULL;
428
429 if (!PyArg_ParseTuple(args, "U|z:unicode_escape_encode",
430 &str, &errors))
431 return NULL;
432
433 return codec_tuple(PyUnicode_EncodeUnicodeEscape(
434 PyUnicode_AS_UNICODE(str),
435 PyUnicode_GET_SIZE(str)),
436 PyUnicode_GET_SIZE(str));
437}
438
439static PyObject *
440raw_unicode_escape_encode(PyObject *self,
441 PyObject *args)
442{
443 PyObject *str;
444 const char *errors = NULL;
445
446 if (!PyArg_ParseTuple(args, "U|z:raw_unicode_escape_encode",
447 &str, &errors))
448 return NULL;
449
450 return codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
451 PyUnicode_AS_UNICODE(str),
452 PyUnicode_GET_SIZE(str)),
453 PyUnicode_GET_SIZE(str));
454}
455
456static PyObject *
457latin_1_encode(PyObject *self,
458 PyObject *args)
459{
460 PyObject *str;
461 const char *errors = NULL;
462
463 if (!PyArg_ParseTuple(args, "U|z:latin_1_encode",
464 &str, &errors))
465 return NULL;
466
467 return codec_tuple(PyUnicode_EncodeLatin1(
468 PyUnicode_AS_UNICODE(str),
469 PyUnicode_GET_SIZE(str),
470 errors),
471 PyUnicode_GET_SIZE(str));
472}
473
474static PyObject *
475ascii_encode(PyObject *self,
476 PyObject *args)
477{
478 PyObject *str;
479 const char *errors = NULL;
480
481 if (!PyArg_ParseTuple(args, "U|z:ascii_encode",
482 &str, &errors))
483 return NULL;
484
485 return codec_tuple(PyUnicode_EncodeASCII(
486 PyUnicode_AS_UNICODE(str),
487 PyUnicode_GET_SIZE(str),
488 errors),
489 PyUnicode_GET_SIZE(str));
490}
491
492static PyObject *
493charmap_encode(PyObject *self,
494 PyObject *args)
495{
496 PyObject *str;
497 const char *errors = NULL;
498 PyObject *mapping = NULL;
499
500 if (!PyArg_ParseTuple(args, "U|zO:charmap_encode",
501 &str, &errors, &mapping))
502 return NULL;
503 if (mapping == Py_None)
504 mapping = NULL;
505
506 return codec_tuple(PyUnicode_EncodeCharmap(
507 PyUnicode_AS_UNICODE(str),
508 PyUnicode_GET_SIZE(str),
509 mapping,
510 errors),
511 PyUnicode_GET_SIZE(str));
512}
513
Guido van Rossum24bdb042000-03-28 20:29:59 +0000514#ifdef MS_WIN32
515
516static PyObject *
517mbcs_encode(PyObject *self,
518 PyObject *args)
519{
520 PyObject *str;
521 const char *errors = NULL;
522
523 if (!PyArg_ParseTuple(args, "U|z:mbcs_encode",
524 &str, &errors))
525 return NULL;
526
527 return codec_tuple(PyUnicode_EncodeMBCS(
528 PyUnicode_AS_UNICODE(str),
529 PyUnicode_GET_SIZE(str),
530 errors),
531 PyUnicode_GET_SIZE(str));
532}
533
534#endif /* MS_WIN32 */
535
Guido van Rossume2d67f92000-03-10 23:09:23 +0000536/* --- Module API --------------------------------------------------------- */
537
538static PyMethodDef _codecs_functions[] = {
539 {"register", codecregister, 1},
540 {"lookup", codeclookup, 1},
541 {"utf_8_encode", utf_8_encode, 1},
542 {"utf_8_decode", utf_8_decode, 1},
543 {"utf_16_encode", utf_16_encode, 1},
544 {"utf_16_le_encode", utf_16_le_encode, 1},
545 {"utf_16_be_encode", utf_16_be_encode, 1},
546 {"utf_16_decode", utf_16_decode, 1},
547 {"utf_16_le_decode", utf_16_le_decode, 1},
548 {"utf_16_be_decode", utf_16_be_decode, 1},
549 {"utf_16_ex_decode", utf_16_ex_decode, 1},
550 {"unicode_escape_encode", unicode_escape_encode, 1},
551 {"unicode_escape_decode", unicode_escape_decode, 1},
552 {"unicode_internal_encode", readbuffer_encode, 1},
553 {"unicode_internal_decode", unicode_internal_decode, 1},
554 {"raw_unicode_escape_encode", raw_unicode_escape_encode, 1},
555 {"raw_unicode_escape_decode", raw_unicode_escape_decode, 1},
556 {"latin_1_encode", latin_1_encode, 1},
557 {"latin_1_decode", latin_1_decode, 1},
558 {"ascii_encode", ascii_encode, 1},
559 {"ascii_decode", ascii_decode, 1},
560 {"charmap_encode", charmap_encode, 1},
561 {"charmap_decode", charmap_decode, 1},
562 {"readbuffer_encode", readbuffer_encode, 1},
563 {"charbuffer_encode", charbuffer_encode, 1},
Guido van Rossum24bdb042000-03-28 20:29:59 +0000564#ifdef MS_WIN32
565 {"mbcs_encode", mbcs_encode, 1},
566 {"mbcs_decode", mbcs_decode, 1},
567#endif
Guido van Rossume2d67f92000-03-10 23:09:23 +0000568 {NULL, NULL} /* sentinel */
569};
570
571DL_EXPORT(void)
572init_codecs()
573{
574 Py_InitModule("_codecs", _codecs_functions);
575}