blob: c8926fcafb5b649b14016aa36b563041f5db5a1f [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000017 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000058
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum21431e82007-10-19 21:48:41 +000063
64 p = PyMem_Malloc(len + 1);
65 if (p == NULL)
66 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000067 for (i = 0; i < len; i++) {
68 register char ch = string[i];
69 if (ch == ' ')
70 ch = '-';
71 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000072 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 p[i] = ch;
74 }
Guido van Rossum21431e82007-10-19 21:48:41 +000075 p[i] = '\0';
76 v = PyUnicode_FromString(p);
77 if (v == NULL)
78 return NULL;
79 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000080 return v;
81}
82
83/* Lookup the given encoding and return a tuple providing the codec
84 facilities.
85
86 The encoding string is looked up converted to all lower-case
87 characters. This makes encodings looked up through this mechanism
88 effectively case-insensitive.
89
Guido van Rossum98297ee2007-11-06 21:34:58 +000090 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000091
92 As side effect, this tries to load the encodings package, if not
93 yet done. This is part of the lazy load strategy for the encodings
94 package.
95
96*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000097
98PyObject *_PyCodec_Lookup(const char *encoding)
99{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000100 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000101 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000102 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000103
Fred Drake766de832000-05-09 19:55:59 +0000104 if (encoding == NULL) {
105 PyErr_BadArgument();
106 goto onError;
107 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000108
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000109 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000111 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000112
Guido van Rossum9e896b32000-04-05 20:11:21 +0000113 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000114 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 replaced with underscores. */
116 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000117 if (v == NULL)
118 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000119 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120
121 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000122 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123 if (result != NULL) {
124 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000125 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 return result;
127 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000128
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 args = PyTuple_New(1);
131 if (args == NULL)
132 goto onError;
133 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000134
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000135 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136 if (len < 0)
137 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000138 if (len == 0) {
139 PyErr_SetString(PyExc_LookupError,
140 "no codec search functions registered: "
141 "can't find encoding");
142 goto onError;
143 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144
145 for (i = 0; i < len; i++) {
146 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000147
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000148 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000149 if (func == NULL)
150 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000151 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000152 if (result == NULL)
153 goto onError;
154 if (result == Py_None) {
155 Py_DECREF(result);
156 continue;
157 }
158 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159 PyErr_SetString(PyExc_TypeError,
160 "codec search functions must return 4-tuples");
161 Py_DECREF(result);
162 goto onError;
163 }
164 break;
165 }
166 if (i == len) {
167 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000168 PyErr_Format(PyExc_LookupError,
169 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 goto onError;
171 }
172
173 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000174 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
175 Py_DECREF(result);
176 goto onError;
177 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 Py_DECREF(args);
179 return result;
180
181 onError:
182 Py_XDECREF(args);
183 return NULL;
184}
185
186static
187PyObject *args_tuple(PyObject *object,
188 const char *errors)
189{
190 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000191
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000192 args = PyTuple_New(1 + (errors != NULL));
193 if (args == NULL)
194 return NULL;
195 Py_INCREF(object);
196 PyTuple_SET_ITEM(args,0,object);
197 if (errors) {
198 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000199
Guido van Rossum21431e82007-10-19 21:48:41 +0000200 v = PyUnicode_FromString(errors);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000201 if (v == NULL) {
202 Py_DECREF(args);
203 return NULL;
204 }
205 PyTuple_SET_ITEM(args, 1, v);
206 }
207 return args;
208}
209
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000210/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211
212static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000215 PyObject *codecs;
216 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000217
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 codecs = _PyCodec_Lookup(encoding);
219 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000220 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000221 v = PyTuple_GET_ITEM(codecs, index);
222 Py_DECREF(codecs);
223 Py_INCREF(v);
224 return v;
225}
226
227/* Helper function to create an incremental codec. */
228
229static
230PyObject *codec_getincrementalcodec(const char *encoding,
231 const char *errors,
232 const char *attrname)
233{
234 PyObject *codecs, *ret, *inccodec;
235
236 codecs = _PyCodec_Lookup(encoding);
237 if (codecs == NULL)
238 return NULL;
239 inccodec = PyObject_GetAttrString(codecs, attrname);
240 Py_DECREF(codecs);
241 if (inccodec == NULL)
242 return NULL;
243 if (errors)
244 ret = PyObject_CallFunction(inccodec, "s", errors);
245 else
246 ret = PyObject_CallFunction(inccodec, NULL);
247 Py_DECREF(inccodec);
248 return ret;
249}
250
251/* Helper function to create a stream codec. */
252
253static
254PyObject *codec_getstreamcodec(const char *encoding,
255 PyObject *stream,
256 const char *errors,
257 const int index)
258{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000259 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260
261 codecs = _PyCodec_Lookup(encoding);
262 if (codecs == NULL)
263 return NULL;
264
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000265 codeccls = PyTuple_GET_ITEM(codecs, index);
266 if (errors != NULL)
267 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
268 else
269 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000270 Py_DECREF(codecs);
271 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000272}
273
Guido van Rossum98297ee2007-11-06 21:34:58 +0000274/* Convenience APIs to query the Codec registry.
275
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000276 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000277
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000278 */
279
280PyObject *PyCodec_Encoder(const char *encoding)
281{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000283}
284
285PyObject *PyCodec_Decoder(const char *encoding)
286{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000288}
289
Thomas Woutersa9773292006-04-21 09:43:23 +0000290PyObject *PyCodec_IncrementalEncoder(const char *encoding,
291 const char *errors)
292{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000294}
295
296PyObject *PyCodec_IncrementalDecoder(const char *encoding,
297 const char *errors)
298{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000300}
301
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302PyObject *PyCodec_StreamReader(const char *encoding,
303 PyObject *stream,
304 const char *errors)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
309PyObject *PyCodec_StreamWriter(const char *encoding,
310 PyObject *stream,
311 const char *errors)
312{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000313 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000314}
315
/* Encode an object (e.g. a Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

   errors is passed to the encoder factory as argument if non-NULL. */
320
321PyObject *PyCodec_Encode(PyObject *object,
322 const char *encoding,
323 const char *errors)
324{
325 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000326 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000327 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000328
329 encoder = PyCodec_Encoder(encoding);
330 if (encoder == NULL)
331 goto onError;
332
333 args = args_tuple(object, errors);
334 if (args == NULL)
335 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000336
337 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000338 if (result == NULL)
339 goto onError;
340
Guido van Rossum98297ee2007-11-06 21:34:58 +0000341 if (!PyTuple_Check(result) ||
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000342 PyTuple_GET_SIZE(result) != 2) {
343 PyErr_SetString(PyExc_TypeError,
Guido van Rossum98297ee2007-11-06 21:34:58 +0000344 "encoder must return a tuple (object, integer)");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000345 goto onError;
346 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000347 v = PyTuple_GET_ITEM(result, 0);
348 if (PyBytes_Check(v)) {
349 char msg[100];
350 PyOS_snprintf(msg, sizeof(msg),
351 "encoder %s returned buffer instead of bytes",
352 encoding);
353 if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) {
354 v = NULL;
355 goto onError;
356 }
357 v = PyString_FromStringAndSize(PyBytes_AS_STRING(v), Py_Size(v));
358 }
359 else if (PyString_Check(v))
360 Py_INCREF(v);
361 else {
362 PyErr_SetString(PyExc_TypeError,
363 "encoding must return a tuple(bytes, integer)");
364 v = NULL;
365 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000366 /* We don't check or use the second (integer) entry. */
367
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000368 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000369 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000370 Py_XDECREF(args);
371 Py_XDECREF(encoder);
Guido van Rossum98297ee2007-11-06 21:34:58 +0000372 return v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373}
374
/* Decode an object (usually a Python string) using the given encoding
   and return an equivalent object (e.g. a Unicode object).

   errors is passed to the decoder factory as argument if non-NULL. */
379
380PyObject *PyCodec_Decode(PyObject *object,
381 const char *encoding,
382 const char *errors)
383{
384 PyObject *decoder = NULL;
385 PyObject *args = NULL, *result = NULL;
386 PyObject *v;
387
388 decoder = PyCodec_Decoder(encoding);
389 if (decoder == NULL)
390 goto onError;
391
392 args = args_tuple(object, errors);
393 if (args == NULL)
394 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000395
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000396 result = PyEval_CallObject(decoder,args);
397 if (result == NULL)
398 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000399 if (!PyTuple_Check(result) ||
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000400 PyTuple_GET_SIZE(result) != 2) {
401 PyErr_SetString(PyExc_TypeError,
402 "decoder must return a tuple (object,integer)");
403 goto onError;
404 }
405 v = PyTuple_GET_ITEM(result,0);
406 Py_INCREF(v);
407 /* We don't check or use the second (integer) entry. */
408
409 Py_DECREF(args);
410 Py_DECREF(decoder);
411 Py_DECREF(result);
412 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000413
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000414 onError:
415 Py_XDECREF(args);
416 Py_XDECREF(decoder);
417 Py_XDECREF(result);
418 return NULL;
419}
420
/* Register the error handling callback function error under the name
   name. The callback will be called by a codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
   callback name itself, provided that name was specified as the error
   parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error */
427int PyCodec_RegisterError(const char *name, PyObject *error)
428{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000429 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000430 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
431 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000432 if (!PyCallable_Check(error)) {
433 PyErr_SetString(PyExc_TypeError, "handler must be callable");
434 return -1;
435 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000436 return PyDict_SetItemString(interp->codec_error_registry,
437 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000438}
439
440/* Lookup the error handling callback function registered under the
441 name error. As a special case NULL can be passed, in which case
442 the error handling callback for strict encoding will be returned. */
443PyObject *PyCodec_LookupError(const char *name)
444{
445 PyObject *handler = NULL;
446
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000447 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000448 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
449 return NULL;
450
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000451 if (name==NULL)
452 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000453 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000454 if (!handler)
455 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
456 else
457 Py_INCREF(handler);
458 return handler;
459}
460
461static void wrong_exception_type(PyObject *exc)
462{
463 PyObject *type = PyObject_GetAttrString(exc, "__class__");
464 if (type != NULL) {
Walter Dörwald573c08c2007-05-25 15:46:59 +0000465 PyObject *name = PyObject_GetAttrString(type, "__name__");
466 Py_DECREF(type);
467 if (name != NULL) {
468 PyErr_Format(PyExc_TypeError,
469 "don't know how to handle %S in error callback", name);
470 Py_DECREF(name);
471 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000472 }
473}
474
475PyObject *PyCodec_StrictErrors(PyObject *exc)
476{
Brett Cannonbf364092006-03-01 04:25:17 +0000477 if (PyExceptionInstance_Check(exc))
478 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000479 else
480 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
481 return NULL;
482}
483
484
485PyObject *PyCodec_IgnoreErrors(PyObject *exc)
486{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000487 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000488 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
489 if (PyUnicodeEncodeError_GetEnd(exc, &end))
490 return NULL;
491 }
492 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
493 if (PyUnicodeDecodeError_GetEnd(exc, &end))
494 return NULL;
495 }
496 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
497 if (PyUnicodeTranslateError_GetEnd(exc, &end))
498 return NULL;
499 }
500 else {
501 wrong_exception_type(exc);
502 return NULL;
503 }
504 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000505 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000506}
507
508
509PyObject *PyCodec_ReplaceErrors(PyObject *exc)
510{
511 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000512 Py_ssize_t start;
513 Py_ssize_t end;
514 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000515
516 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
517 PyObject *res;
518 Py_UNICODE *p;
519 if (PyUnicodeEncodeError_GetStart(exc, &start))
520 return NULL;
521 if (PyUnicodeEncodeError_GetEnd(exc, &end))
522 return NULL;
523 res = PyUnicode_FromUnicode(NULL, end-start);
524 if (res == NULL)
525 return NULL;
526 for (p = PyUnicode_AS_UNICODE(res), i = start;
527 i<end; ++p, ++i)
528 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000529 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000530 Py_DECREF(res);
531 return restuple;
532 }
533 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
534 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
535 if (PyUnicodeDecodeError_GetEnd(exc, &end))
536 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000537 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000538 }
539 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
540 PyObject *res;
541 Py_UNICODE *p;
542 if (PyUnicodeTranslateError_GetStart(exc, &start))
543 return NULL;
544 if (PyUnicodeTranslateError_GetEnd(exc, &end))
545 return NULL;
546 res = PyUnicode_FromUnicode(NULL, end-start);
547 if (res == NULL)
548 return NULL;
549 for (p = PyUnicode_AS_UNICODE(res), i = start;
550 i<end; ++p, ++i)
551 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000552 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000553 Py_DECREF(res);
554 return restuple;
555 }
556 else {
557 wrong_exception_type(exc);
558 return NULL;
559 }
560}
561
562PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
563{
564 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
565 PyObject *restuple;
566 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000567 Py_ssize_t start;
568 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000569 PyObject *res;
570 Py_UNICODE *p;
571 Py_UNICODE *startp;
572 Py_UNICODE *outp;
573 int ressize;
574 if (PyUnicodeEncodeError_GetStart(exc, &start))
575 return NULL;
576 if (PyUnicodeEncodeError_GetEnd(exc, &end))
577 return NULL;
578 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
579 return NULL;
580 startp = PyUnicode_AS_UNICODE(object);
581 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
582 if (*p<10)
583 ressize += 2+1+1;
584 else if (*p<100)
585 ressize += 2+2+1;
586 else if (*p<1000)
587 ressize += 2+3+1;
588 else if (*p<10000)
589 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000590#ifndef Py_UNICODE_WIDE
591 else
592 ressize += 2+5+1;
593#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000594 else if (*p<100000)
595 ressize += 2+5+1;
596 else if (*p<1000000)
597 ressize += 2+6+1;
598 else
599 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000600#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000601 }
602 /* allocate replacement */
603 res = PyUnicode_FromUnicode(NULL, ressize);
604 if (res == NULL) {
605 Py_DECREF(object);
606 return NULL;
607 }
608 /* generate replacement */
609 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
610 p < startp+end; ++p) {
611 Py_UNICODE c = *p;
612 int digits;
613 int base;
614 *outp++ = '&';
615 *outp++ = '#';
616 if (*p<10) {
617 digits = 1;
618 base = 1;
619 }
620 else if (*p<100) {
621 digits = 2;
622 base = 10;
623 }
624 else if (*p<1000) {
625 digits = 3;
626 base = 100;
627 }
628 else if (*p<10000) {
629 digits = 4;
630 base = 1000;
631 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000632#ifndef Py_UNICODE_WIDE
633 else {
634 digits = 5;
635 base = 10000;
636 }
637#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638 else if (*p<100000) {
639 digits = 5;
640 base = 10000;
641 }
642 else if (*p<1000000) {
643 digits = 6;
644 base = 100000;
645 }
646 else {
647 digits = 7;
648 base = 1000000;
649 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000650#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000651 while (digits-->0) {
652 *outp++ = '0' + c/base;
653 c %= base;
654 base /= 10;
655 }
656 *outp++ = ';';
657 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000658 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000659 Py_DECREF(res);
660 Py_DECREF(object);
661 return restuple;
662 }
663 else {
664 wrong_exception_type(exc);
665 return NULL;
666 }
667}
668
669static Py_UNICODE hexdigits[] = {
670 '0', '1', '2', '3', '4', '5', '6', '7',
671 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
672};
673
674PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
675{
676 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
677 PyObject *restuple;
678 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000679 Py_ssize_t start;
680 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 PyObject *res;
682 Py_UNICODE *p;
683 Py_UNICODE *startp;
684 Py_UNICODE *outp;
685 int ressize;
686 if (PyUnicodeEncodeError_GetStart(exc, &start))
687 return NULL;
688 if (PyUnicodeEncodeError_GetEnd(exc, &end))
689 return NULL;
690 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
691 return NULL;
692 startp = PyUnicode_AS_UNICODE(object);
693 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000694#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695 if (*p >= 0x00010000)
696 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000697 else
698#endif
699 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000700 ressize += 1+1+4;
701 }
702 else
703 ressize += 1+1+2;
704 }
705 res = PyUnicode_FromUnicode(NULL, ressize);
706 if (res==NULL)
707 return NULL;
708 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
709 p < startp+end; ++p) {
710 Py_UNICODE c = *p;
711 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000712#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000713 if (c >= 0x00010000) {
714 *outp++ = 'U';
715 *outp++ = hexdigits[(c>>28)&0xf];
716 *outp++ = hexdigits[(c>>24)&0xf];
717 *outp++ = hexdigits[(c>>20)&0xf];
718 *outp++ = hexdigits[(c>>16)&0xf];
719 *outp++ = hexdigits[(c>>12)&0xf];
720 *outp++ = hexdigits[(c>>8)&0xf];
721 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000722 else
723#endif
724 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000725 *outp++ = 'u';
726 *outp++ = hexdigits[(c>>12)&0xf];
727 *outp++ = hexdigits[(c>>8)&0xf];
728 }
729 else
730 *outp++ = 'x';
731 *outp++ = hexdigits[(c>>4)&0xf];
732 *outp++ = hexdigits[c&0xf];
733 }
734
Martin v. Löwis18e16552006-02-15 17:27:45 +0000735 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000736 Py_DECREF(res);
737 Py_DECREF(object);
738 return restuple;
739 }
740 else {
741 wrong_exception_type(exc);
742 return NULL;
743 }
744}
745
746static PyObject *strict_errors(PyObject *self, PyObject *exc)
747{
748 return PyCodec_StrictErrors(exc);
749}
750
751
752static PyObject *ignore_errors(PyObject *self, PyObject *exc)
753{
754 return PyCodec_IgnoreErrors(exc);
755}
756
757
758static PyObject *replace_errors(PyObject *self, PyObject *exc)
759{
760 return PyCodec_ReplaceErrors(exc);
761}
762
763
764static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
765{
766 return PyCodec_XMLCharRefReplaceErrors(exc);
767}
768
769
770static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
771{
772 return PyCodec_BackslashReplaceErrors(exc);
773}
774
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000775static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000776{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000777 static struct {
778 char *name;
779 PyMethodDef def;
780 } methods[] =
781 {
782 {
783 "strict",
784 {
785 "strict_errors",
786 strict_errors,
787 METH_O
788 }
789 },
790 {
791 "ignore",
792 {
793 "ignore_errors",
794 ignore_errors,
795 METH_O
796 }
797 },
798 {
799 "replace",
800 {
801 "replace_errors",
802 replace_errors,
803 METH_O
804 }
805 },
806 {
807 "xmlcharrefreplace",
808 {
809 "xmlcharrefreplace_errors",
810 xmlcharrefreplace_errors,
811 METH_O
812 }
813 },
814 {
815 "backslashreplace",
816 {
817 "backslashreplace_errors",
818 backslashreplace_errors,
819 METH_O
820 }
821 }
822 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000823
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000824 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000825 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000826 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000827
828 if (interp->codec_search_path != NULL)
829 return 0;
830
831 interp->codec_search_path = PyList_New(0);
832 interp->codec_search_cache = PyDict_New();
833 interp->codec_error_registry = PyDict_New();
834
835 if (interp->codec_error_registry) {
836 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
837 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
838 int res;
839 if (!func)
840 Py_FatalError("can't initialize codec error registry");
841 res = PyCodec_RegisterError(methods[i].name, func);
842 Py_DECREF(func);
843 if (res)
844 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000845 }
846 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000847
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000848 if (interp->codec_search_path == NULL ||
849 interp->codec_search_cache == NULL ||
850 interp->codec_error_registry == NULL)
851 Py_FatalError("can't initialize codec registry");
852
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000853 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000854 if (mod == NULL) {
855 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
856 /* Ignore ImportErrors... this is done so that
857 distributions can disable the encodings package. Note
858 that other errors are not masked, e.g. SystemErrors
859 raised to inform the user of an error in the Python
860 configuration are still reported back to the user. */
861 PyErr_Clear();
862 return 0;
863 }
864 return -1;
865 }
866 Py_DECREF(mod);
867 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000868}