blob: 532f1a67723901f2db774d98a89231558462822a [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
17 codec search function.
18
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Guido van Rossum582acec2000-06-28 22:07:35 +000059 if (len > INT_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
63
64 v = PyString_FromStringAndSize(NULL, (int)len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
73 ch = tolower(ch);
74 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000098 int i, len;
99
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000170 PyDict_SetItem(interp->codec_search_cache, v, result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177}
178
179static
180PyObject *args_tuple(PyObject *object,
181 const char *errors)
182{
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201}
202
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000203/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000204
205static
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000206PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000207{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000208 PyObject *codecs;
209 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000210
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000211 codecs = _PyCodec_Lookup(encoding);
212 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000213 return NULL;
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000214 v = PyTuple_GET_ITEM(codecs, index);
215 Py_DECREF(codecs);
216 Py_INCREF(v);
217 return v;
218}
219
220/* Helper function to create an incremental codec. */
221
222static
223PyObject *codec_getincrementalcodec(const char *encoding,
224 const char *errors,
225 const char *attrname)
226{
227 PyObject *codecs, *ret, *inccodec;
228
229 codecs = _PyCodec_Lookup(encoding);
230 if (codecs == NULL)
231 return NULL;
232 inccodec = PyObject_GetAttrString(codecs, attrname);
233 if (inccodec == NULL) {
234 Py_DECREF(codecs);
235 return NULL;
236 }
237 if (errors)
238 ret = PyObject_CallFunction(inccodec, "s", errors);
239 else
240 ret = PyObject_CallFunction(inccodec, NULL);
241 Py_DECREF(inccodec);
242 Py_DECREF(codecs);
243 return ret;
244}
245
246/* Helper function to create a stream codec. */
247
248static
249PyObject *codec_getstreamcodec(const char *encoding,
250 PyObject *stream,
251 const char *errors,
252 const int index)
253{
254 PyObject *codecs, *streamcodec;
255
256 codecs = _PyCodec_Lookup(encoding);
257 if (codecs == NULL)
258 return NULL;
259
260 streamcodec = PyEval_CallFunction(
261 PyTuple_GET_ITEM(codecs, index), "Os", stream, errors);
262 Py_DECREF(codecs);
263 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000264}
265
266/* Convenience APIs to query the Codec registry.
267
268 All APIs return a codec object with incremented refcount.
269
270 */
271
272PyObject *PyCodec_Encoder(const char *encoding)
273{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000274 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000275}
276
277PyObject *PyCodec_Decoder(const char *encoding)
278{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000279 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000280}
281
Walter Dörwaldabb02e52006-03-15 11:35:15 +0000282PyObject *PyCodec_IncrementalEncoder(const char *encoding,
283 const char *errors)
284{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000285 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Walter Dörwaldabb02e52006-03-15 11:35:15 +0000286}
287
288PyObject *PyCodec_IncrementalDecoder(const char *encoding,
289 const char *errors)
290{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000291 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Walter Dörwaldabb02e52006-03-15 11:35:15 +0000292}
293
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000294PyObject *PyCodec_StreamReader(const char *encoding,
295 PyObject *stream,
296 const char *errors)
297{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000298 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000299}
300
301PyObject *PyCodec_StreamWriter(const char *encoding,
302 PyObject *stream,
303 const char *errors)
304{
Walter Dörwaldd53850a2006-03-16 21:46:40 +0000305 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000306}
307
308/* Encode an object (e.g. an Unicode object) using the given encoding
309 and return the resulting encoded object (usually a Python string).
310
311 errors is passed to the encoder factory as argument if non-NULL. */
312
313PyObject *PyCodec_Encode(PyObject *object,
314 const char *encoding,
315 const char *errors)
316{
317 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000318 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000319 PyObject *v;
320
321 encoder = PyCodec_Encoder(encoding);
322 if (encoder == NULL)
323 goto onError;
324
325 args = args_tuple(object, errors);
326 if (args == NULL)
327 goto onError;
328
329 result = PyEval_CallObject(encoder,args);
330 if (result == NULL)
331 goto onError;
332
333 if (!PyTuple_Check(result) ||
334 PyTuple_GET_SIZE(result) != 2) {
335 PyErr_SetString(PyExc_TypeError,
336 "encoder must return a tuple (object,integer)");
337 goto onError;
338 }
339 v = PyTuple_GET_ITEM(result,0);
340 Py_INCREF(v);
341 /* We don't check or use the second (integer) entry. */
342
343 Py_DECREF(args);
344 Py_DECREF(encoder);
345 Py_DECREF(result);
346 return v;
347
348 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000349 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000350 Py_XDECREF(args);
351 Py_XDECREF(encoder);
352 return NULL;
353}
354
355/* Decode an object (usually a Python string) using the given encoding
356 and return an equivalent object (e.g. an Unicode object).
357
358 errors is passed to the decoder factory as argument if non-NULL. */
359
360PyObject *PyCodec_Decode(PyObject *object,
361 const char *encoding,
362 const char *errors)
363{
364 PyObject *decoder = NULL;
365 PyObject *args = NULL, *result = NULL;
366 PyObject *v;
367
368 decoder = PyCodec_Decoder(encoding);
369 if (decoder == NULL)
370 goto onError;
371
372 args = args_tuple(object, errors);
373 if (args == NULL)
374 goto onError;
375
376 result = PyEval_CallObject(decoder,args);
377 if (result == NULL)
378 goto onError;
379 if (!PyTuple_Check(result) ||
380 PyTuple_GET_SIZE(result) != 2) {
381 PyErr_SetString(PyExc_TypeError,
382 "decoder must return a tuple (object,integer)");
383 goto onError;
384 }
385 v = PyTuple_GET_ITEM(result,0);
386 Py_INCREF(v);
387 /* We don't check or use the second (integer) entry. */
388
389 Py_DECREF(args);
390 Py_DECREF(decoder);
391 Py_DECREF(result);
392 return v;
393
394 onError:
395 Py_XDECREF(args);
396 Py_XDECREF(decoder);
397 Py_XDECREF(result);
398 return NULL;
399}
400
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000401/* Register the error handling callback function error under the name
402 name. This function will be called by the codec when it encounters
403 an unencodable characters/undecodable bytes and doesn't know the
404 callback name, when name is specified as the error parameter
405 in the call to the encode/decode function.
406 Return 0 on success, -1 on error */
407int PyCodec_RegisterError(const char *name, PyObject *error)
408{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000409 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000410 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
411 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000412 if (!PyCallable_Check(error)) {
413 PyErr_SetString(PyExc_TypeError, "handler must be callable");
414 return -1;
415 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000416 return PyDict_SetItemString(interp->codec_error_registry,
417 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000418}
419
420/* Lookup the error handling callback function registered under the
421 name error. As a special case NULL can be passed, in which case
422 the error handling callback for strict encoding will be returned. */
423PyObject *PyCodec_LookupError(const char *name)
424{
425 PyObject *handler = NULL;
426
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000427 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000428 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
429 return NULL;
430
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000431 if (name==NULL)
432 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000433 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000434 if (!handler)
435 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
436 else
437 Py_INCREF(handler);
438 return handler;
439}
440
441static void wrong_exception_type(PyObject *exc)
442{
443 PyObject *type = PyObject_GetAttrString(exc, "__class__");
444 if (type != NULL) {
445 PyObject *name = PyObject_GetAttrString(type, "__name__");
446 Py_DECREF(type);
447 if (name != NULL) {
448 PyObject *string = PyObject_Str(name);
449 Py_DECREF(name);
Walter Dörwaldf7bcd1d2002-09-02 18:22:32 +0000450 if (string != NULL) {
451 PyErr_Format(PyExc_TypeError,
452 "don't know how to handle %.400s in error callback",
453 PyString_AS_STRING(string));
454 Py_DECREF(string);
455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 }
457 }
458}
459
460PyObject *PyCodec_StrictErrors(PyObject *exc)
461{
Brett Cannonbf364092006-03-01 04:25:17 +0000462 if (PyExceptionInstance_Check(exc))
463 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000464 else
465 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
466 return NULL;
467}
468
469
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000470#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000471PyObject *PyCodec_IgnoreErrors(PyObject *exc)
472{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000473 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000474 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
475 if (PyUnicodeEncodeError_GetEnd(exc, &end))
476 return NULL;
477 }
478 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
479 if (PyUnicodeDecodeError_GetEnd(exc, &end))
480 return NULL;
481 }
482 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
483 if (PyUnicodeTranslateError_GetEnd(exc, &end))
484 return NULL;
485 }
486 else {
487 wrong_exception_type(exc);
488 return NULL;
489 }
490 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000491 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000492}
493
494
495PyObject *PyCodec_ReplaceErrors(PyObject *exc)
496{
497 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000498 Py_ssize_t start;
499 Py_ssize_t end;
500 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000501
502 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
503 PyObject *res;
504 Py_UNICODE *p;
505 if (PyUnicodeEncodeError_GetStart(exc, &start))
506 return NULL;
507 if (PyUnicodeEncodeError_GetEnd(exc, &end))
508 return NULL;
509 res = PyUnicode_FromUnicode(NULL, end-start);
510 if (res == NULL)
511 return NULL;
512 for (p = PyUnicode_AS_UNICODE(res), i = start;
513 i<end; ++p, ++i)
514 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000515 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000516 Py_DECREF(res);
517 return restuple;
518 }
519 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
520 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
521 if (PyUnicodeDecodeError_GetEnd(exc, &end))
522 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000523 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000524 }
525 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
526 PyObject *res;
527 Py_UNICODE *p;
528 if (PyUnicodeTranslateError_GetStart(exc, &start))
529 return NULL;
530 if (PyUnicodeTranslateError_GetEnd(exc, &end))
531 return NULL;
532 res = PyUnicode_FromUnicode(NULL, end-start);
533 if (res == NULL)
534 return NULL;
535 for (p = PyUnicode_AS_UNICODE(res), i = start;
536 i<end; ++p, ++i)
537 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000538 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000539 Py_DECREF(res);
540 return restuple;
541 }
542 else {
543 wrong_exception_type(exc);
544 return NULL;
545 }
546}
547
548PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
549{
550 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
551 PyObject *restuple;
552 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000553 Py_ssize_t start;
554 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000555 PyObject *res;
556 Py_UNICODE *p;
557 Py_UNICODE *startp;
558 Py_UNICODE *outp;
559 int ressize;
560 if (PyUnicodeEncodeError_GetStart(exc, &start))
561 return NULL;
562 if (PyUnicodeEncodeError_GetEnd(exc, &end))
563 return NULL;
564 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
565 return NULL;
566 startp = PyUnicode_AS_UNICODE(object);
567 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
568 if (*p<10)
569 ressize += 2+1+1;
570 else if (*p<100)
571 ressize += 2+2+1;
572 else if (*p<1000)
573 ressize += 2+3+1;
574 else if (*p<10000)
575 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000576#ifndef Py_UNICODE_WIDE
577 else
578 ressize += 2+5+1;
579#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000580 else if (*p<100000)
581 ressize += 2+5+1;
582 else if (*p<1000000)
583 ressize += 2+6+1;
584 else
585 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000586#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000587 }
588 /* allocate replacement */
589 res = PyUnicode_FromUnicode(NULL, ressize);
590 if (res == NULL) {
591 Py_DECREF(object);
592 return NULL;
593 }
594 /* generate replacement */
595 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
596 p < startp+end; ++p) {
597 Py_UNICODE c = *p;
598 int digits;
599 int base;
600 *outp++ = '&';
601 *outp++ = '#';
602 if (*p<10) {
603 digits = 1;
604 base = 1;
605 }
606 else if (*p<100) {
607 digits = 2;
608 base = 10;
609 }
610 else if (*p<1000) {
611 digits = 3;
612 base = 100;
613 }
614 else if (*p<10000) {
615 digits = 4;
616 base = 1000;
617 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000618#ifndef Py_UNICODE_WIDE
619 else {
620 digits = 5;
621 base = 10000;
622 }
623#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000624 else if (*p<100000) {
625 digits = 5;
626 base = 10000;
627 }
628 else if (*p<1000000) {
629 digits = 6;
630 base = 100000;
631 }
632 else {
633 digits = 7;
634 base = 1000000;
635 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000636#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000637 while (digits-->0) {
638 *outp++ = '0' + c/base;
639 c %= base;
640 base /= 10;
641 }
642 *outp++ = ';';
643 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 Py_DECREF(res);
646 Py_DECREF(object);
647 return restuple;
648 }
649 else {
650 wrong_exception_type(exc);
651 return NULL;
652 }
653}
654
655static Py_UNICODE hexdigits[] = {
656 '0', '1', '2', '3', '4', '5', '6', '7',
657 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
658};
659
660PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
661{
662 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
663 PyObject *restuple;
664 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000665 Py_ssize_t start;
666 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000667 PyObject *res;
668 Py_UNICODE *p;
669 Py_UNICODE *startp;
670 Py_UNICODE *outp;
671 int ressize;
672 if (PyUnicodeEncodeError_GetStart(exc, &start))
673 return NULL;
674 if (PyUnicodeEncodeError_GetEnd(exc, &end))
675 return NULL;
676 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
677 return NULL;
678 startp = PyUnicode_AS_UNICODE(object);
679 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000680#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 if (*p >= 0x00010000)
682 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000683 else
684#endif
685 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000686 ressize += 1+1+4;
687 }
688 else
689 ressize += 1+1+2;
690 }
691 res = PyUnicode_FromUnicode(NULL, ressize);
692 if (res==NULL)
693 return NULL;
694 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
695 p < startp+end; ++p) {
696 Py_UNICODE c = *p;
697 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000698#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000699 if (c >= 0x00010000) {
700 *outp++ = 'U';
701 *outp++ = hexdigits[(c>>28)&0xf];
702 *outp++ = hexdigits[(c>>24)&0xf];
703 *outp++ = hexdigits[(c>>20)&0xf];
704 *outp++ = hexdigits[(c>>16)&0xf];
705 *outp++ = hexdigits[(c>>12)&0xf];
706 *outp++ = hexdigits[(c>>8)&0xf];
707 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000708 else
709#endif
710 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000711 *outp++ = 'u';
712 *outp++ = hexdigits[(c>>12)&0xf];
713 *outp++ = hexdigits[(c>>8)&0xf];
714 }
715 else
716 *outp++ = 'x';
717 *outp++ = hexdigits[(c>>4)&0xf];
718 *outp++ = hexdigits[c&0xf];
719 }
720
Martin v. Löwis18e16552006-02-15 17:27:45 +0000721 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000722 Py_DECREF(res);
723 Py_DECREF(object);
724 return restuple;
725 }
726 else {
727 wrong_exception_type(exc);
728 return NULL;
729 }
730}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000731#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000732
733static PyObject *strict_errors(PyObject *self, PyObject *exc)
734{
735 return PyCodec_StrictErrors(exc);
736}
737
738
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000739#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000740static PyObject *ignore_errors(PyObject *self, PyObject *exc)
741{
742 return PyCodec_IgnoreErrors(exc);
743}
744
745
746static PyObject *replace_errors(PyObject *self, PyObject *exc)
747{
748 return PyCodec_ReplaceErrors(exc);
749}
750
751
752static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
753{
754 return PyCodec_XMLCharRefReplaceErrors(exc);
755}
756
757
758static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
759{
760 return PyCodec_BackslashReplaceErrors(exc);
761}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000762#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000763
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000764static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000766 static struct {
767 char *name;
768 PyMethodDef def;
769 } methods[] =
770 {
771 {
772 "strict",
773 {
774 "strict_errors",
775 strict_errors,
776 METH_O
777 }
778 },
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000779#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000780 {
781 "ignore",
782 {
783 "ignore_errors",
784 ignore_errors,
785 METH_O
786 }
787 },
788 {
789 "replace",
790 {
791 "replace_errors",
792 replace_errors,
793 METH_O
794 }
795 },
796 {
797 "xmlcharrefreplace",
798 {
799 "xmlcharrefreplace_errors",
800 xmlcharrefreplace_errors,
801 METH_O
802 }
803 },
804 {
805 "backslashreplace",
806 {
807 "backslashreplace_errors",
808 backslashreplace_errors,
809 METH_O
810 }
811 }
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000812#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000815 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000816 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000817 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000818
819 if (interp->codec_search_path != NULL)
820 return 0;
821
822 interp->codec_search_path = PyList_New(0);
823 interp->codec_search_cache = PyDict_New();
824 interp->codec_error_registry = PyDict_New();
825
826 if (interp->codec_error_registry) {
827 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
828 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
829 int res;
830 if (!func)
831 Py_FatalError("can't initialize codec error registry");
832 res = PyCodec_RegisterError(methods[i].name, func);
833 Py_DECREF(func);
834 if (res)
835 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000836 }
837 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000838
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000839 if (interp->codec_search_path == NULL ||
840 interp->codec_search_cache == NULL ||
841 interp->codec_error_registry == NULL)
842 Py_FatalError("can't initialize codec registry");
843
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000844 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000845 if (mod == NULL) {
846 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
847 /* Ignore ImportErrors... this is done so that
848 distributions can disable the encodings package. Note
849 that other errors are not masked, e.g. SystemErrors
850 raised to inform the user of an error in the Python
851 configuration are still reported back to the user. */
852 PyErr_Clear();
853 return 0;
854 }
855 return -1;
856 }
857 Py_DECREF(mod);
858 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000859}