blob: 4b2467634dd30bfb083abf190c16ec74fe1e15b0 [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not slow down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000058
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum21431e82007-10-19 21:48:41 +000063
64 p = PyMem_Malloc(len + 1);
65 if (p == NULL)
66 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000067 for (i = 0; i < len; i++) {
68 register char ch = string[i];
69 if (ch == ' ')
70 ch = '-';
71 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000072 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 p[i] = ch;
74 }
Guido van Rossum21431e82007-10-19 21:48:41 +000075 p[i] = '\0';
76 v = PyUnicode_FromString(p);
77 if (v == NULL)
78 return NULL;
79 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000080 return v;
81}
82
83/* Lookup the given encoding and return a tuple providing the codec
84 facilities.
85
86 The encoding string is looked up converted to all lower-case
87 characters. This makes encodings looked up through this mechanism
88 effectively case-insensitive.
89
Fred Drake766de832000-05-09 19:55:59 +000090 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000091
92 As side effect, this tries to load the encodings package, if not
93 yet done. This is part of the lazy load strategy for the encodings
94 package.
95
96*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000097
98PyObject *_PyCodec_Lookup(const char *encoding)
99{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000100 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000101 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000102 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000103
Fred Drake766de832000-05-09 19:55:59 +0000104 if (encoding == NULL) {
105 PyErr_BadArgument();
106 goto onError;
107 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000108
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000109 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000111 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000112
Guido van Rossum9e896b32000-04-05 20:11:21 +0000113 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000114 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 replaced with underscores. */
116 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000117 if (v == NULL)
118 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000119 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120
121 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000122 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123 if (result != NULL) {
124 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000125 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 return result;
127 }
128
129 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 args = PyTuple_New(1);
131 if (args == NULL)
132 goto onError;
133 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000134
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000135 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136 if (len < 0)
137 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000138 if (len == 0) {
139 PyErr_SetString(PyExc_LookupError,
140 "no codec search functions registered: "
141 "can't find encoding");
142 goto onError;
143 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144
145 for (i = 0; i < len; i++) {
146 PyObject *func;
147
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000148 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000149 if (func == NULL)
150 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000151 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000152 if (result == NULL)
153 goto onError;
154 if (result == Py_None) {
155 Py_DECREF(result);
156 continue;
157 }
158 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159 PyErr_SetString(PyExc_TypeError,
160 "codec search functions must return 4-tuples");
161 Py_DECREF(result);
162 goto onError;
163 }
164 break;
165 }
166 if (i == len) {
167 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000168 PyErr_Format(PyExc_LookupError,
169 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 goto onError;
171 }
172
173 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000174 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
175 Py_DECREF(result);
176 goto onError;
177 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 Py_DECREF(args);
179 return result;
180
181 onError:
182 Py_XDECREF(args);
183 return NULL;
184}
185
186static
187PyObject *args_tuple(PyObject *object,
188 const char *errors)
189{
190 PyObject *args;
191
192 args = PyTuple_New(1 + (errors != NULL));
193 if (args == NULL)
194 return NULL;
195 Py_INCREF(object);
196 PyTuple_SET_ITEM(args,0,object);
197 if (errors) {
198 PyObject *v;
199
Guido van Rossum21431e82007-10-19 21:48:41 +0000200 v = PyUnicode_FromString(errors);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000201 if (v == NULL) {
202 Py_DECREF(args);
203 return NULL;
204 }
205 PyTuple_SET_ITEM(args, 1, v);
206 }
207 return args;
208}
209
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000210/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211
212static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000213PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000215 PyObject *codecs;
216 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000217
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000218 codecs = _PyCodec_Lookup(encoding);
219 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000220 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000221 v = PyTuple_GET_ITEM(codecs, index);
222 Py_DECREF(codecs);
223 Py_INCREF(v);
224 return v;
225}
226
227/* Helper function to create an incremental codec. */
228
229static
230PyObject *codec_getincrementalcodec(const char *encoding,
231 const char *errors,
232 const char *attrname)
233{
234 PyObject *codecs, *ret, *inccodec;
235
236 codecs = _PyCodec_Lookup(encoding);
237 if (codecs == NULL)
238 return NULL;
239 inccodec = PyObject_GetAttrString(codecs, attrname);
240 Py_DECREF(codecs);
241 if (inccodec == NULL)
242 return NULL;
243 if (errors)
244 ret = PyObject_CallFunction(inccodec, "s", errors);
245 else
246 ret = PyObject_CallFunction(inccodec, NULL);
247 Py_DECREF(inccodec);
248 return ret;
249}
250
251/* Helper function to create a stream codec. */
252
253static
254PyObject *codec_getstreamcodec(const char *encoding,
255 PyObject *stream,
256 const char *errors,
257 const int index)
258{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000259 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260
261 codecs = _PyCodec_Lookup(encoding);
262 if (codecs == NULL)
263 return NULL;
264
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000265 codeccls = PyTuple_GET_ITEM(codecs, index);
266 if (errors != NULL)
267 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
268 else
269 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000270 Py_DECREF(codecs);
271 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000272}
273
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.

 */
279
280PyObject *PyCodec_Encoder(const char *encoding)
281{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000283}
284
285PyObject *PyCodec_Decoder(const char *encoding)
286{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000288}
289
Thomas Woutersa9773292006-04-21 09:43:23 +0000290PyObject *PyCodec_IncrementalEncoder(const char *encoding,
291 const char *errors)
292{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000294}
295
296PyObject *PyCodec_IncrementalDecoder(const char *encoding,
297 const char *errors)
298{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000300}
301
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302PyObject *PyCodec_StreamReader(const char *encoding,
303 PyObject *stream,
304 const char *errors)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
309PyObject *PyCodec_StreamWriter(const char *encoding,
310 PyObject *stream,
311 const char *errors)
312{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000313 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000314}
315
316/* Encode an object (e.g. an Unicode object) using the given encoding
317 and return the resulting encoded object (usually a Python string).
318
319 errors is passed to the encoder factory as argument if non-NULL. */
320
321PyObject *PyCodec_Encode(PyObject *object,
322 const char *encoding,
323 const char *errors)
324{
325 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000326 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000327 PyObject *v;
328
329 encoder = PyCodec_Encoder(encoding);
330 if (encoder == NULL)
331 goto onError;
332
333 args = args_tuple(object, errors);
334 if (args == NULL)
335 goto onError;
336
337 result = PyEval_CallObject(encoder,args);
338 if (result == NULL)
339 goto onError;
340
341 if (!PyTuple_Check(result) ||
342 PyTuple_GET_SIZE(result) != 2) {
343 PyErr_SetString(PyExc_TypeError,
344 "encoder must return a tuple (object,integer)");
345 goto onError;
346 }
347 v = PyTuple_GET_ITEM(result,0);
348 Py_INCREF(v);
349 /* We don't check or use the second (integer) entry. */
350
351 Py_DECREF(args);
352 Py_DECREF(encoder);
353 Py_DECREF(result);
354 return v;
355
356 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000357 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000358 Py_XDECREF(args);
359 Py_XDECREF(encoder);
360 return NULL;
361}
362
363/* Decode an object (usually a Python string) using the given encoding
364 and return an equivalent object (e.g. an Unicode object).
365
366 errors is passed to the decoder factory as argument if non-NULL. */
367
368PyObject *PyCodec_Decode(PyObject *object,
369 const char *encoding,
370 const char *errors)
371{
372 PyObject *decoder = NULL;
373 PyObject *args = NULL, *result = NULL;
374 PyObject *v;
375
376 decoder = PyCodec_Decoder(encoding);
377 if (decoder == NULL)
378 goto onError;
379
380 args = args_tuple(object, errors);
381 if (args == NULL)
382 goto onError;
383
384 result = PyEval_CallObject(decoder,args);
385 if (result == NULL)
386 goto onError;
387 if (!PyTuple_Check(result) ||
388 PyTuple_GET_SIZE(result) != 2) {
389 PyErr_SetString(PyExc_TypeError,
390 "decoder must return a tuple (object,integer)");
391 goto onError;
392 }
393 v = PyTuple_GET_ITEM(result,0);
394 Py_INCREF(v);
395 /* We don't check or use the second (integer) entry. */
396
397 Py_DECREF(args);
398 Py_DECREF(decoder);
399 Py_DECREF(result);
400 return v;
401
402 onError:
403 Py_XDECREF(args);
404 Py_XDECREF(decoder);
405 Py_XDECREF(result);
406 return NULL;
407}
408
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000409/* Register the error handling callback function error under the name
410 name. This function will be called by the codec when it encounters
411 an unencodable characters/undecodable bytes and doesn't know the
412 callback name, when name is specified as the error parameter
413 in the call to the encode/decode function.
414 Return 0 on success, -1 on error */
415int PyCodec_RegisterError(const char *name, PyObject *error)
416{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000417 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000418 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
419 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000420 if (!PyCallable_Check(error)) {
421 PyErr_SetString(PyExc_TypeError, "handler must be callable");
422 return -1;
423 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000424 return PyDict_SetItemString(interp->codec_error_registry,
425 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000426}
427
428/* Lookup the error handling callback function registered under the
429 name error. As a special case NULL can be passed, in which case
430 the error handling callback for strict encoding will be returned. */
431PyObject *PyCodec_LookupError(const char *name)
432{
433 PyObject *handler = NULL;
434
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000435 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000436 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
437 return NULL;
438
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000439 if (name==NULL)
440 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000441 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000442 if (!handler)
443 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
444 else
445 Py_INCREF(handler);
446 return handler;
447}
448
449static void wrong_exception_type(PyObject *exc)
450{
451 PyObject *type = PyObject_GetAttrString(exc, "__class__");
452 if (type != NULL) {
Walter Dörwald573c08c2007-05-25 15:46:59 +0000453 PyObject *name = PyObject_GetAttrString(type, "__name__");
454 Py_DECREF(type);
455 if (name != NULL) {
456 PyErr_Format(PyExc_TypeError,
457 "don't know how to handle %S in error callback", name);
458 Py_DECREF(name);
459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000460 }
461}
462
463PyObject *PyCodec_StrictErrors(PyObject *exc)
464{
Brett Cannonbf364092006-03-01 04:25:17 +0000465 if (PyExceptionInstance_Check(exc))
466 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000467 else
468 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
469 return NULL;
470}
471
472
473PyObject *PyCodec_IgnoreErrors(PyObject *exc)
474{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000475 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000476 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
477 if (PyUnicodeEncodeError_GetEnd(exc, &end))
478 return NULL;
479 }
480 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
481 if (PyUnicodeDecodeError_GetEnd(exc, &end))
482 return NULL;
483 }
484 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
485 if (PyUnicodeTranslateError_GetEnd(exc, &end))
486 return NULL;
487 }
488 else {
489 wrong_exception_type(exc);
490 return NULL;
491 }
492 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000493 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000494}
495
496
497PyObject *PyCodec_ReplaceErrors(PyObject *exc)
498{
499 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000500 Py_ssize_t start;
501 Py_ssize_t end;
502 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000503
504 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
505 PyObject *res;
506 Py_UNICODE *p;
507 if (PyUnicodeEncodeError_GetStart(exc, &start))
508 return NULL;
509 if (PyUnicodeEncodeError_GetEnd(exc, &end))
510 return NULL;
511 res = PyUnicode_FromUnicode(NULL, end-start);
512 if (res == NULL)
513 return NULL;
514 for (p = PyUnicode_AS_UNICODE(res), i = start;
515 i<end; ++p, ++i)
516 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000517 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000518 Py_DECREF(res);
519 return restuple;
520 }
521 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
522 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
523 if (PyUnicodeDecodeError_GetEnd(exc, &end))
524 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000525 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000526 }
527 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
528 PyObject *res;
529 Py_UNICODE *p;
530 if (PyUnicodeTranslateError_GetStart(exc, &start))
531 return NULL;
532 if (PyUnicodeTranslateError_GetEnd(exc, &end))
533 return NULL;
534 res = PyUnicode_FromUnicode(NULL, end-start);
535 if (res == NULL)
536 return NULL;
537 for (p = PyUnicode_AS_UNICODE(res), i = start;
538 i<end; ++p, ++i)
539 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000540 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000541 Py_DECREF(res);
542 return restuple;
543 }
544 else {
545 wrong_exception_type(exc);
546 return NULL;
547 }
548}
549
550PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
551{
552 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
553 PyObject *restuple;
554 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000555 Py_ssize_t start;
556 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000557 PyObject *res;
558 Py_UNICODE *p;
559 Py_UNICODE *startp;
560 Py_UNICODE *outp;
561 int ressize;
562 if (PyUnicodeEncodeError_GetStart(exc, &start))
563 return NULL;
564 if (PyUnicodeEncodeError_GetEnd(exc, &end))
565 return NULL;
566 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
567 return NULL;
568 startp = PyUnicode_AS_UNICODE(object);
569 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
570 if (*p<10)
571 ressize += 2+1+1;
572 else if (*p<100)
573 ressize += 2+2+1;
574 else if (*p<1000)
575 ressize += 2+3+1;
576 else if (*p<10000)
577 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000578#ifndef Py_UNICODE_WIDE
579 else
580 ressize += 2+5+1;
581#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000582 else if (*p<100000)
583 ressize += 2+5+1;
584 else if (*p<1000000)
585 ressize += 2+6+1;
586 else
587 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000588#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000589 }
590 /* allocate replacement */
591 res = PyUnicode_FromUnicode(NULL, ressize);
592 if (res == NULL) {
593 Py_DECREF(object);
594 return NULL;
595 }
596 /* generate replacement */
597 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
598 p < startp+end; ++p) {
599 Py_UNICODE c = *p;
600 int digits;
601 int base;
602 *outp++ = '&';
603 *outp++ = '#';
604 if (*p<10) {
605 digits = 1;
606 base = 1;
607 }
608 else if (*p<100) {
609 digits = 2;
610 base = 10;
611 }
612 else if (*p<1000) {
613 digits = 3;
614 base = 100;
615 }
616 else if (*p<10000) {
617 digits = 4;
618 base = 1000;
619 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000620#ifndef Py_UNICODE_WIDE
621 else {
622 digits = 5;
623 base = 10000;
624 }
625#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 else if (*p<100000) {
627 digits = 5;
628 base = 10000;
629 }
630 else if (*p<1000000) {
631 digits = 6;
632 base = 100000;
633 }
634 else {
635 digits = 7;
636 base = 1000000;
637 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000638#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000639 while (digits-->0) {
640 *outp++ = '0' + c/base;
641 c %= base;
642 base /= 10;
643 }
644 *outp++ = ';';
645 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000646 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000647 Py_DECREF(res);
648 Py_DECREF(object);
649 return restuple;
650 }
651 else {
652 wrong_exception_type(exc);
653 return NULL;
654 }
655}
656
657static Py_UNICODE hexdigits[] = {
658 '0', '1', '2', '3', '4', '5', '6', '7',
659 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
660};
661
662PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
663{
664 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
665 PyObject *restuple;
666 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000667 Py_ssize_t start;
668 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000669 PyObject *res;
670 Py_UNICODE *p;
671 Py_UNICODE *startp;
672 Py_UNICODE *outp;
673 int ressize;
674 if (PyUnicodeEncodeError_GetStart(exc, &start))
675 return NULL;
676 if (PyUnicodeEncodeError_GetEnd(exc, &end))
677 return NULL;
678 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
679 return NULL;
680 startp = PyUnicode_AS_UNICODE(object);
681 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000682#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000683 if (*p >= 0x00010000)
684 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000685 else
686#endif
687 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688 ressize += 1+1+4;
689 }
690 else
691 ressize += 1+1+2;
692 }
693 res = PyUnicode_FromUnicode(NULL, ressize);
694 if (res==NULL)
695 return NULL;
696 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
697 p < startp+end; ++p) {
698 Py_UNICODE c = *p;
699 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000700#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701 if (c >= 0x00010000) {
702 *outp++ = 'U';
703 *outp++ = hexdigits[(c>>28)&0xf];
704 *outp++ = hexdigits[(c>>24)&0xf];
705 *outp++ = hexdigits[(c>>20)&0xf];
706 *outp++ = hexdigits[(c>>16)&0xf];
707 *outp++ = hexdigits[(c>>12)&0xf];
708 *outp++ = hexdigits[(c>>8)&0xf];
709 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000710 else
711#endif
712 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000713 *outp++ = 'u';
714 *outp++ = hexdigits[(c>>12)&0xf];
715 *outp++ = hexdigits[(c>>8)&0xf];
716 }
717 else
718 *outp++ = 'x';
719 *outp++ = hexdigits[(c>>4)&0xf];
720 *outp++ = hexdigits[c&0xf];
721 }
722
Martin v. Löwis18e16552006-02-15 17:27:45 +0000723 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000724 Py_DECREF(res);
725 Py_DECREF(object);
726 return restuple;
727 }
728 else {
729 wrong_exception_type(exc);
730 return NULL;
731 }
732}
733
734static PyObject *strict_errors(PyObject *self, PyObject *exc)
735{
736 return PyCodec_StrictErrors(exc);
737}
738
739
740static PyObject *ignore_errors(PyObject *self, PyObject *exc)
741{
742 return PyCodec_IgnoreErrors(exc);
743}
744
745
746static PyObject *replace_errors(PyObject *self, PyObject *exc)
747{
748 return PyCodec_ReplaceErrors(exc);
749}
750
751
752static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
753{
754 return PyCodec_XMLCharRefReplaceErrors(exc);
755}
756
757
758static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
759{
760 return PyCodec_BackslashReplaceErrors(exc);
761}
762
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000763static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000764{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000765 static struct {
766 char *name;
767 PyMethodDef def;
768 } methods[] =
769 {
770 {
771 "strict",
772 {
773 "strict_errors",
774 strict_errors,
775 METH_O
776 }
777 },
778 {
779 "ignore",
780 {
781 "ignore_errors",
782 ignore_errors,
783 METH_O
784 }
785 },
786 {
787 "replace",
788 {
789 "replace_errors",
790 replace_errors,
791 METH_O
792 }
793 },
794 {
795 "xmlcharrefreplace",
796 {
797 "xmlcharrefreplace_errors",
798 xmlcharrefreplace_errors,
799 METH_O
800 }
801 },
802 {
803 "backslashreplace",
804 {
805 "backslashreplace_errors",
806 backslashreplace_errors,
807 METH_O
808 }
809 }
810 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000812 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000813 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000814 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000815
816 if (interp->codec_search_path != NULL)
817 return 0;
818
819 interp->codec_search_path = PyList_New(0);
820 interp->codec_search_cache = PyDict_New();
821 interp->codec_error_registry = PyDict_New();
822
823 if (interp->codec_error_registry) {
824 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
825 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
826 int res;
827 if (!func)
828 Py_FatalError("can't initialize codec error registry");
829 res = PyCodec_RegisterError(methods[i].name, func);
830 Py_DECREF(func);
831 if (res)
832 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000833 }
834 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000835
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000836 if (interp->codec_search_path == NULL ||
837 interp->codec_search_cache == NULL ||
838 interp->codec_error_registry == NULL)
839 Py_FatalError("can't initialize codec registry");
840
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000841 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000842 if (mod == NULL) {
843 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
844 /* Ignore ImportErrors... this is done so that
845 distributions can disable the encodings package. Note
846 that other errors are not masked, e.g. SystemErrors
847 raised to inform the user of an error in the Python
848 configuration are still reported back to the user. */
849 PyErr_Clear();
850 return 0;
851 }
852 return -1;
853 }
854 Py_DECREF(mod);
855 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000856}