blob: 6c8a2d44e6d9533ddb02b104f4863113ba5b0a4e [file] [log] [blame]
Guido van Rossume2d67f92000-03-10 23:09:23 +00001/* ------------------------------------------------------------------------
2
3 _codecs -- Provides access to the codec registry and the builtin
4 codecs.
5
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
8
9 The codec registry is accessible via:
10
11 register(search_function) -> None
12
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
14
15 The builtin Unicode codecs use the following interface:
16
     <encoding>_encode(Unicode_object[,errors='strict']) ->
        (string object, Unicode characters consumed)
19
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
22
   These <encoding>s are available: utf_8, utf_16, utf_16_le, utf_16_be,
   unicode_escape, raw_unicode_escape, unicode_internal, latin_1,
   ascii (7-bit), charmap
25
26Written by Marc-Andre Lemburg (mal@lemburg.com).
27
28(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
29
30 ------------------------------------------------------------------------ */
31
32#include "Python.h"
33
34/* --- Registry ----------------------------------------------------------- */
35
36static
37PyObject *codecregister(PyObject *self, PyObject *args)
38{
39 PyObject *search_function;
40
41 if (!PyArg_ParseTuple(args, "O:register", &search_function))
42 goto onError;
43
44 if (PyCodec_Register(search_function))
45 goto onError;
46
47 Py_INCREF(Py_None);
48 return Py_None;
49
50 onError:
51 return NULL;
52}
53
54static
55PyObject *codeclookup(PyObject *self, PyObject *args)
56{
57 char *encoding;
58
59 if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
60 goto onError;
61
62 return _PyCodec_Lookup(encoding);
63
64 onError:
65 return NULL;
66}
67
68/* --- Helpers ------------------------------------------------------------ */
69
70static
71PyObject *codec_tuple(PyObject *unicode,
72 int len)
73{
74 PyObject *v,*w;
75
76 if (unicode == NULL)
77 return NULL;
78 v = PyTuple_New(2);
79 if (v == NULL) {
80 Py_DECREF(unicode);
81 return NULL;
82 }
83 PyTuple_SET_ITEM(v,0,unicode);
84 w = PyInt_FromLong(len);
85 if (w == NULL) {
86 Py_DECREF(v);
87 return NULL;
88 }
89 PyTuple_SET_ITEM(v,1,w);
90 return v;
91}
92
93/* --- Decoder ------------------------------------------------------------ */
94
95static PyObject *
96unicode_internal_decode(PyObject *self,
97 PyObject *args)
98{
99 const char *data;
100 int size;
101 const char *errors = NULL;
102
103 if (!PyArg_ParseTuple(args, "s#|z:unicode_internal_decode",
104 &data, &size, &errors))
105 return NULL;
106
107 return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
108 size / sizeof(Py_UNICODE)),
109 size);
110}
111
112static PyObject *
113utf_8_decode(PyObject *self,
114 PyObject *args)
115{
116 const char *data;
117 int size;
118 const char *errors = NULL;
119
120 if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode",
121 &data, &size, &errors))
122 return NULL;
123
124 return codec_tuple(PyUnicode_DecodeUTF8(data, size, errors),
125 size);
126}
127
128static PyObject *
129utf_16_decode(PyObject *self,
130 PyObject *args)
131{
132 const char *data;
133 int size;
134 const char *errors = NULL;
135 int byteorder = 0;
136
137 if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode",
138 &data, &size, &errors))
139 return NULL;
140 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
141 size);
142}
143
144static PyObject *
145utf_16_le_decode(PyObject *self,
146 PyObject *args)
147{
148 const char *data;
149 int size;
150 const char *errors = NULL;
151 int byteorder = -1;
152
153 if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode",
154 &data, &size, &errors))
155 return NULL;
156 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
157 size);
158}
159
160static PyObject *
161utf_16_be_decode(PyObject *self,
162 PyObject *args)
163{
164 const char *data;
165 int size;
166 const char *errors = NULL;
167 int byteorder = 1;
168
169 if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode",
170 &data, &size, &errors))
171 return NULL;
172 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
173 size);
174}
175
176/* This non-standard version also provides access to the byteorder
177 parameter of the builtin UTF-16 codec.
178
179 It returns a tuple (unicode, bytesread, byteorder) with byteorder
180 being the value in effect at the end of data.
181
182*/
183
184static PyObject *
185utf_16_ex_decode(PyObject *self,
186 PyObject *args)
187{
188 const char *data;
189 int size;
190 const char *errors = NULL;
191 int byteorder = 0;
192 PyObject *unicode, *tuple;
193
194 if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode",
195 &data, &size, &errors, &byteorder))
196 return NULL;
197
198 unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder);
199 if (unicode == NULL)
200 return NULL;
201 tuple = Py_BuildValue("Oii", unicode, size, byteorder);
202 Py_DECREF(unicode);
203 return tuple;
204}
205
206static PyObject *
207unicode_escape_decode(PyObject *self,
208 PyObject *args)
209{
210 const char *data;
211 int size;
212 const char *errors = NULL;
213
214 if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode",
215 &data, &size, &errors))
216 return NULL;
217
218 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data, size, errors),
219 size);
220}
221
222static PyObject *
223raw_unicode_escape_decode(PyObject *self,
224 PyObject *args)
225{
226 const char *data;
227 int size;
228 const char *errors = NULL;
229
230 if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode",
231 &data, &size, &errors))
232 return NULL;
233
234 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data, size, errors),
235 size);
236}
237
238static PyObject *
239latin_1_decode(PyObject *self,
240 PyObject *args)
241{
242 const char *data;
243 int size;
244 const char *errors = NULL;
245
246 if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode",
247 &data, &size, &errors))
248 return NULL;
249
250 return codec_tuple(PyUnicode_DecodeLatin1(data, size, errors),
251 size);
252}
253
254static PyObject *
255ascii_decode(PyObject *self,
256 PyObject *args)
257{
258 const char *data;
259 int size;
260 const char *errors = NULL;
261
262 if (!PyArg_ParseTuple(args, "t#|z:ascii_decode",
263 &data, &size, &errors))
264 return NULL;
265
266 return codec_tuple(PyUnicode_DecodeASCII(data, size, errors),
267 size);
268}
269
270static PyObject *
271charmap_decode(PyObject *self,
272 PyObject *args)
273{
274 const char *data;
275 int size;
276 const char *errors = NULL;
277 PyObject *mapping = NULL;
278
279 if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode",
280 &data, &size, &errors, &mapping))
281 return NULL;
282 if (mapping == Py_None)
283 mapping = NULL;
284
285 return codec_tuple(PyUnicode_DecodeCharmap(data, size, mapping, errors),
286 size);
287}
288
289/* --- Encoder ------------------------------------------------------------ */
290
291static PyObject *
292readbuffer_encode(PyObject *self,
293 PyObject *args)
294{
295 const char *data;
296 int size;
297 const char *errors = NULL;
298
299 if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
300 &data, &size, &errors))
301 return NULL;
302
303 return codec_tuple(PyString_FromStringAndSize(data, size),
304 size);
305}
306
307static PyObject *
308charbuffer_encode(PyObject *self,
309 PyObject *args)
310{
311 const char *data;
312 int size;
313 const char *errors = NULL;
314
315 if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
316 &data, &size, &errors))
317 return NULL;
318
319 return codec_tuple(PyString_FromStringAndSize(data, size),
320 size);
321}
322
323static PyObject *
324utf_8_encode(PyObject *self,
325 PyObject *args)
326{
327 PyObject *str;
328 const char *errors = NULL;
329
330 if (!PyArg_ParseTuple(args, "U|z:utf_8_encode",
331 &str, &errors))
332 return NULL;
333
334 return codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
335 PyUnicode_GET_SIZE(str),
336 errors),
337 PyUnicode_GET_SIZE(str));
338}
339
340/* This version provides access to the byteorder parameter of the
341 builtin UTF-16 codecs as optional third argument. It defaults to 0
342 which means: use the native byte order and prepend the data with a
343 BOM mark.
344
345*/
346
347static PyObject *
348utf_16_encode(PyObject *self,
349 PyObject *args)
350{
351 PyObject *str;
352 const char *errors = NULL;
353 int byteorder = 0;
354
355 if (!PyArg_ParseTuple(args, "U|zi:utf_16_encode",
356 &str, &errors, &byteorder))
357 return NULL;
358
359 return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
360 PyUnicode_GET_SIZE(str),
361 errors,
362 byteorder),
363 PyUnicode_GET_SIZE(str));
364}
365
366static PyObject *
367utf_16_le_encode(PyObject *self,
368 PyObject *args)
369{
370 PyObject *str;
371 const char *errors = NULL;
372
373 if (!PyArg_ParseTuple(args, "U|zi:utf_16_le_encode",
374 &str, &errors))
375 return NULL;
376
377 return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
378 PyUnicode_GET_SIZE(str),
379 errors,
380 -1),
381 PyUnicode_GET_SIZE(str));
382}
383
384static PyObject *
385utf_16_be_encode(PyObject *self,
386 PyObject *args)
387{
388 PyObject *str;
389 const char *errors = NULL;
390
391 if (!PyArg_ParseTuple(args, "U|zi:utf_16_be_encode",
392 &str, &errors))
393 return NULL;
394
395 return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
396 PyUnicode_GET_SIZE(str),
397 errors,
398 +1),
399 PyUnicode_GET_SIZE(str));
400}
401
402static PyObject *
403unicode_escape_encode(PyObject *self,
404 PyObject *args)
405{
406 PyObject *str;
407 const char *errors = NULL;
408
409 if (!PyArg_ParseTuple(args, "U|z:unicode_escape_encode",
410 &str, &errors))
411 return NULL;
412
413 return codec_tuple(PyUnicode_EncodeUnicodeEscape(
414 PyUnicode_AS_UNICODE(str),
415 PyUnicode_GET_SIZE(str)),
416 PyUnicode_GET_SIZE(str));
417}
418
419static PyObject *
420raw_unicode_escape_encode(PyObject *self,
421 PyObject *args)
422{
423 PyObject *str;
424 const char *errors = NULL;
425
426 if (!PyArg_ParseTuple(args, "U|z:raw_unicode_escape_encode",
427 &str, &errors))
428 return NULL;
429
430 return codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
431 PyUnicode_AS_UNICODE(str),
432 PyUnicode_GET_SIZE(str)),
433 PyUnicode_GET_SIZE(str));
434}
435
436static PyObject *
437latin_1_encode(PyObject *self,
438 PyObject *args)
439{
440 PyObject *str;
441 const char *errors = NULL;
442
443 if (!PyArg_ParseTuple(args, "U|z:latin_1_encode",
444 &str, &errors))
445 return NULL;
446
447 return codec_tuple(PyUnicode_EncodeLatin1(
448 PyUnicode_AS_UNICODE(str),
449 PyUnicode_GET_SIZE(str),
450 errors),
451 PyUnicode_GET_SIZE(str));
452}
453
454static PyObject *
455ascii_encode(PyObject *self,
456 PyObject *args)
457{
458 PyObject *str;
459 const char *errors = NULL;
460
461 if (!PyArg_ParseTuple(args, "U|z:ascii_encode",
462 &str, &errors))
463 return NULL;
464
465 return codec_tuple(PyUnicode_EncodeASCII(
466 PyUnicode_AS_UNICODE(str),
467 PyUnicode_GET_SIZE(str),
468 errors),
469 PyUnicode_GET_SIZE(str));
470}
471
472static PyObject *
473charmap_encode(PyObject *self,
474 PyObject *args)
475{
476 PyObject *str;
477 const char *errors = NULL;
478 PyObject *mapping = NULL;
479
480 if (!PyArg_ParseTuple(args, "U|zO:charmap_encode",
481 &str, &errors, &mapping))
482 return NULL;
483 if (mapping == Py_None)
484 mapping = NULL;
485
486 return codec_tuple(PyUnicode_EncodeCharmap(
487 PyUnicode_AS_UNICODE(str),
488 PyUnicode_GET_SIZE(str),
489 mapping,
490 errors),
491 PyUnicode_GET_SIZE(str));
492}
493
494/* --- Module API --------------------------------------------------------- */
495
496static PyMethodDef _codecs_functions[] = {
497 {"register", codecregister, 1},
498 {"lookup", codeclookup, 1},
499 {"utf_8_encode", utf_8_encode, 1},
500 {"utf_8_decode", utf_8_decode, 1},
501 {"utf_16_encode", utf_16_encode, 1},
502 {"utf_16_le_encode", utf_16_le_encode, 1},
503 {"utf_16_be_encode", utf_16_be_encode, 1},
504 {"utf_16_decode", utf_16_decode, 1},
505 {"utf_16_le_decode", utf_16_le_decode, 1},
506 {"utf_16_be_decode", utf_16_be_decode, 1},
507 {"utf_16_ex_decode", utf_16_ex_decode, 1},
508 {"unicode_escape_encode", unicode_escape_encode, 1},
509 {"unicode_escape_decode", unicode_escape_decode, 1},
510 {"unicode_internal_encode", readbuffer_encode, 1},
511 {"unicode_internal_decode", unicode_internal_decode, 1},
512 {"raw_unicode_escape_encode", raw_unicode_escape_encode, 1},
513 {"raw_unicode_escape_decode", raw_unicode_escape_decode, 1},
514 {"latin_1_encode", latin_1_encode, 1},
515 {"latin_1_decode", latin_1_decode, 1},
516 {"ascii_encode", ascii_encode, 1},
517 {"ascii_decode", ascii_decode, 1},
518 {"charmap_encode", charmap_encode, 1},
519 {"charmap_decode", charmap_decode, 1},
520 {"readbuffer_encode", readbuffer_encode, 1},
521 {"charbuffer_encode", charbuffer_encode, 1},
522 {NULL, NULL} /* sentinel */
523};
524
525DL_EXPORT(void)
526init_codecs()
527{
528 Py_InitModule("_codecs", _codecs_functions);
529}