blob: 046abe35079af3208762ad4bea0e6dcd6a315aaa [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
/* Import the standard encodings package which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   slow down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum582acec2000-06-28 22:07:35 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 v = PyString_FromStringAndSize(NULL, len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000073 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000074 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +000098 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000170 PyDict_SetItem(interp->codec_search_cache, v, result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177}
178
179static
180PyObject *args_tuple(PyObject *object,
181 const char *errors)
182{
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201}
202
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000203/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000204
205static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000207{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000208 PyObject *codecs;
209 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000210
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 codecs = _PyCodec_Lookup(encoding);
212 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000213 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000214 v = PyTuple_GET_ITEM(codecs, index);
215 Py_DECREF(codecs);
216 Py_INCREF(v);
217 return v;
218}
219
220/* Helper function to create an incremental codec. */
221
222static
223PyObject *codec_getincrementalcodec(const char *encoding,
224 const char *errors,
225 const char *attrname)
226{
227 PyObject *codecs, *ret, *inccodec;
228
229 codecs = _PyCodec_Lookup(encoding);
230 if (codecs == NULL)
231 return NULL;
232 inccodec = PyObject_GetAttrString(codecs, attrname);
233 Py_DECREF(codecs);
234 if (inccodec == NULL)
235 return NULL;
236 if (errors)
237 ret = PyObject_CallFunction(inccodec, "s", errors);
238 else
239 ret = PyObject_CallFunction(inccodec, NULL);
240 Py_DECREF(inccodec);
241 return ret;
242}
243
244/* Helper function to create a stream codec. */
245
246static
247PyObject *codec_getstreamcodec(const char *encoding,
248 PyObject *stream,
249 const char *errors,
250 const int index)
251{
252 PyObject *codecs, *streamcodec;
253
254 codecs = _PyCodec_Lookup(encoding);
255 if (codecs == NULL)
256 return NULL;
257
258 streamcodec = PyEval_CallFunction(
259 PyTuple_GET_ITEM(codecs, index), "Os", stream, errors);
260 Py_DECREF(codecs);
261 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000262}
263
264/* Convenience APIs to query the Codec registry.
265
266 All APIs return a codec object with incremented refcount.
267
268 */
269
270PyObject *PyCodec_Encoder(const char *encoding)
271{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000272 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000273}
274
275PyObject *PyCodec_Decoder(const char *encoding)
276{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000278}
279
Thomas Woutersa9773292006-04-21 09:43:23 +0000280PyObject *PyCodec_IncrementalEncoder(const char *encoding,
281 const char *errors)
282{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000284}
285
286PyObject *PyCodec_IncrementalDecoder(const char *encoding,
287 const char *errors)
288{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000290}
291
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000292PyObject *PyCodec_StreamReader(const char *encoding,
293 PyObject *stream,
294 const char *errors)
295{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000296 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000297}
298
299PyObject *PyCodec_StreamWriter(const char *encoding,
300 PyObject *stream,
301 const char *errors)
302{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000303 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000304}
305
306/* Encode an object (e.g. an Unicode object) using the given encoding
307 and return the resulting encoded object (usually a Python string).
308
309 errors is passed to the encoder factory as argument if non-NULL. */
310
311PyObject *PyCodec_Encode(PyObject *object,
312 const char *encoding,
313 const char *errors)
314{
315 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000316 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000317 PyObject *v;
318
319 encoder = PyCodec_Encoder(encoding);
320 if (encoder == NULL)
321 goto onError;
322
323 args = args_tuple(object, errors);
324 if (args == NULL)
325 goto onError;
326
327 result = PyEval_CallObject(encoder,args);
328 if (result == NULL)
329 goto onError;
330
331 if (!PyTuple_Check(result) ||
332 PyTuple_GET_SIZE(result) != 2) {
333 PyErr_SetString(PyExc_TypeError,
334 "encoder must return a tuple (object,integer)");
335 goto onError;
336 }
337 v = PyTuple_GET_ITEM(result,0);
338 Py_INCREF(v);
339 /* We don't check or use the second (integer) entry. */
340
341 Py_DECREF(args);
342 Py_DECREF(encoder);
343 Py_DECREF(result);
344 return v;
345
346 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000347 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000348 Py_XDECREF(args);
349 Py_XDECREF(encoder);
350 return NULL;
351}
352
353/* Decode an object (usually a Python string) using the given encoding
354 and return an equivalent object (e.g. an Unicode object).
355
356 errors is passed to the decoder factory as argument if non-NULL. */
357
358PyObject *PyCodec_Decode(PyObject *object,
359 const char *encoding,
360 const char *errors)
361{
362 PyObject *decoder = NULL;
363 PyObject *args = NULL, *result = NULL;
364 PyObject *v;
365
366 decoder = PyCodec_Decoder(encoding);
367 if (decoder == NULL)
368 goto onError;
369
370 args = args_tuple(object, errors);
371 if (args == NULL)
372 goto onError;
373
374 result = PyEval_CallObject(decoder,args);
375 if (result == NULL)
376 goto onError;
377 if (!PyTuple_Check(result) ||
378 PyTuple_GET_SIZE(result) != 2) {
379 PyErr_SetString(PyExc_TypeError,
380 "decoder must return a tuple (object,integer)");
381 goto onError;
382 }
383 v = PyTuple_GET_ITEM(result,0);
384 Py_INCREF(v);
385 /* We don't check or use the second (integer) entry. */
386
387 Py_DECREF(args);
388 Py_DECREF(decoder);
389 Py_DECREF(result);
390 return v;
391
392 onError:
393 Py_XDECREF(args);
394 Py_XDECREF(decoder);
395 Py_XDECREF(result);
396 return NULL;
397}
398
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000399/* Register the error handling callback function error under the name
400 name. This function will be called by the codec when it encounters
401 an unencodable characters/undecodable bytes and doesn't know the
402 callback name, when name is specified as the error parameter
403 in the call to the encode/decode function.
404 Return 0 on success, -1 on error */
405int PyCodec_RegisterError(const char *name, PyObject *error)
406{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000407 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000408 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
409 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000410 if (!PyCallable_Check(error)) {
411 PyErr_SetString(PyExc_TypeError, "handler must be callable");
412 return -1;
413 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000414 return PyDict_SetItemString(interp->codec_error_registry,
415 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000416}
417
418/* Lookup the error handling callback function registered under the
419 name error. As a special case NULL can be passed, in which case
420 the error handling callback for strict encoding will be returned. */
421PyObject *PyCodec_LookupError(const char *name)
422{
423 PyObject *handler = NULL;
424
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000425 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000426 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
427 return NULL;
428
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000429 if (name==NULL)
430 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000431 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000432 if (!handler)
433 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
434 else
435 Py_INCREF(handler);
436 return handler;
437}
438
439static void wrong_exception_type(PyObject *exc)
440{
441 PyObject *type = PyObject_GetAttrString(exc, "__class__");
442 if (type != NULL) {
443 PyObject *name = PyObject_GetAttrString(type, "__name__");
444 Py_DECREF(type);
445 if (name != NULL) {
446 PyObject *string = PyObject_Str(name);
447 Py_DECREF(name);
Walter Dörwaldf7bcd1d2002-09-02 18:22:32 +0000448 if (string != NULL) {
449 PyErr_Format(PyExc_TypeError,
450 "don't know how to handle %.400s in error callback",
451 PyString_AS_STRING(string));
452 Py_DECREF(string);
453 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000454 }
455 }
456}
457
458PyObject *PyCodec_StrictErrors(PyObject *exc)
459{
Brett Cannonbf364092006-03-01 04:25:17 +0000460 if (PyExceptionInstance_Check(exc))
461 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000462 else
463 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
464 return NULL;
465}
466
467
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000468#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000469PyObject *PyCodec_IgnoreErrors(PyObject *exc)
470{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000471 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000472 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
473 if (PyUnicodeEncodeError_GetEnd(exc, &end))
474 return NULL;
475 }
476 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
477 if (PyUnicodeDecodeError_GetEnd(exc, &end))
478 return NULL;
479 }
480 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
481 if (PyUnicodeTranslateError_GetEnd(exc, &end))
482 return NULL;
483 }
484 else {
485 wrong_exception_type(exc);
486 return NULL;
487 }
488 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000489 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000490}
491
492
493PyObject *PyCodec_ReplaceErrors(PyObject *exc)
494{
495 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000496 Py_ssize_t start;
497 Py_ssize_t end;
498 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000499
500 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
501 PyObject *res;
502 Py_UNICODE *p;
503 if (PyUnicodeEncodeError_GetStart(exc, &start))
504 return NULL;
505 if (PyUnicodeEncodeError_GetEnd(exc, &end))
506 return NULL;
507 res = PyUnicode_FromUnicode(NULL, end-start);
508 if (res == NULL)
509 return NULL;
510 for (p = PyUnicode_AS_UNICODE(res), i = start;
511 i<end; ++p, ++i)
512 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000513 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000514 Py_DECREF(res);
515 return restuple;
516 }
517 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
518 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
519 if (PyUnicodeDecodeError_GetEnd(exc, &end))
520 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000521 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522 }
523 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
524 PyObject *res;
525 Py_UNICODE *p;
526 if (PyUnicodeTranslateError_GetStart(exc, &start))
527 return NULL;
528 if (PyUnicodeTranslateError_GetEnd(exc, &end))
529 return NULL;
530 res = PyUnicode_FromUnicode(NULL, end-start);
531 if (res == NULL)
532 return NULL;
533 for (p = PyUnicode_AS_UNICODE(res), i = start;
534 i<end; ++p, ++i)
535 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000536 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537 Py_DECREF(res);
538 return restuple;
539 }
540 else {
541 wrong_exception_type(exc);
542 return NULL;
543 }
544}
545
546PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
547{
548 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
549 PyObject *restuple;
550 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t start;
552 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000553 PyObject *res;
554 Py_UNICODE *p;
555 Py_UNICODE *startp;
556 Py_UNICODE *outp;
557 int ressize;
558 if (PyUnicodeEncodeError_GetStart(exc, &start))
559 return NULL;
560 if (PyUnicodeEncodeError_GetEnd(exc, &end))
561 return NULL;
562 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
563 return NULL;
564 startp = PyUnicode_AS_UNICODE(object);
565 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
566 if (*p<10)
567 ressize += 2+1+1;
568 else if (*p<100)
569 ressize += 2+2+1;
570 else if (*p<1000)
571 ressize += 2+3+1;
572 else if (*p<10000)
573 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000574#ifndef Py_UNICODE_WIDE
575 else
576 ressize += 2+5+1;
577#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000578 else if (*p<100000)
579 ressize += 2+5+1;
580 else if (*p<1000000)
581 ressize += 2+6+1;
582 else
583 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000584#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000585 }
586 /* allocate replacement */
587 res = PyUnicode_FromUnicode(NULL, ressize);
588 if (res == NULL) {
589 Py_DECREF(object);
590 return NULL;
591 }
592 /* generate replacement */
593 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
594 p < startp+end; ++p) {
595 Py_UNICODE c = *p;
596 int digits;
597 int base;
598 *outp++ = '&';
599 *outp++ = '#';
600 if (*p<10) {
601 digits = 1;
602 base = 1;
603 }
604 else if (*p<100) {
605 digits = 2;
606 base = 10;
607 }
608 else if (*p<1000) {
609 digits = 3;
610 base = 100;
611 }
612 else if (*p<10000) {
613 digits = 4;
614 base = 1000;
615 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000616#ifndef Py_UNICODE_WIDE
617 else {
618 digits = 5;
619 base = 10000;
620 }
621#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000622 else if (*p<100000) {
623 digits = 5;
624 base = 10000;
625 }
626 else if (*p<1000000) {
627 digits = 6;
628 base = 100000;
629 }
630 else {
631 digits = 7;
632 base = 1000000;
633 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000634#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000635 while (digits-->0) {
636 *outp++ = '0' + c/base;
637 c %= base;
638 base /= 10;
639 }
640 *outp++ = ';';
641 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000642 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000643 Py_DECREF(res);
644 Py_DECREF(object);
645 return restuple;
646 }
647 else {
648 wrong_exception_type(exc);
649 return NULL;
650 }
651}
652
653static Py_UNICODE hexdigits[] = {
654 '0', '1', '2', '3', '4', '5', '6', '7',
655 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
656};
657
658PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
659{
660 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
661 PyObject *restuple;
662 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000663 Py_ssize_t start;
664 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000665 PyObject *res;
666 Py_UNICODE *p;
667 Py_UNICODE *startp;
668 Py_UNICODE *outp;
669 int ressize;
670 if (PyUnicodeEncodeError_GetStart(exc, &start))
671 return NULL;
672 if (PyUnicodeEncodeError_GetEnd(exc, &end))
673 return NULL;
674 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
675 return NULL;
676 startp = PyUnicode_AS_UNICODE(object);
677 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000678#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000679 if (*p >= 0x00010000)
680 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000681 else
682#endif
683 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000684 ressize += 1+1+4;
685 }
686 else
687 ressize += 1+1+2;
688 }
689 res = PyUnicode_FromUnicode(NULL, ressize);
690 if (res==NULL)
691 return NULL;
692 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
693 p < startp+end; ++p) {
694 Py_UNICODE c = *p;
695 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000696#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697 if (c >= 0x00010000) {
698 *outp++ = 'U';
699 *outp++ = hexdigits[(c>>28)&0xf];
700 *outp++ = hexdigits[(c>>24)&0xf];
701 *outp++ = hexdigits[(c>>20)&0xf];
702 *outp++ = hexdigits[(c>>16)&0xf];
703 *outp++ = hexdigits[(c>>12)&0xf];
704 *outp++ = hexdigits[(c>>8)&0xf];
705 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000706 else
707#endif
708 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000709 *outp++ = 'u';
710 *outp++ = hexdigits[(c>>12)&0xf];
711 *outp++ = hexdigits[(c>>8)&0xf];
712 }
713 else
714 *outp++ = 'x';
715 *outp++ = hexdigits[(c>>4)&0xf];
716 *outp++ = hexdigits[c&0xf];
717 }
718
Martin v. Löwis18e16552006-02-15 17:27:45 +0000719 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000720 Py_DECREF(res);
721 Py_DECREF(object);
722 return restuple;
723 }
724 else {
725 wrong_exception_type(exc);
726 return NULL;
727 }
728}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000729#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000730
731static PyObject *strict_errors(PyObject *self, PyObject *exc)
732{
733 return PyCodec_StrictErrors(exc);
734}
735
736
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000737#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000738static PyObject *ignore_errors(PyObject *self, PyObject *exc)
739{
740 return PyCodec_IgnoreErrors(exc);
741}
742
743
744static PyObject *replace_errors(PyObject *self, PyObject *exc)
745{
746 return PyCodec_ReplaceErrors(exc);
747}
748
749
750static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
751{
752 return PyCodec_XMLCharRefReplaceErrors(exc);
753}
754
755
756static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
757{
758 return PyCodec_BackslashReplaceErrors(exc);
759}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000760#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000761
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000762static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000763{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000764 static struct {
765 char *name;
766 PyMethodDef def;
767 } methods[] =
768 {
769 {
770 "strict",
771 {
772 "strict_errors",
773 strict_errors,
774 METH_O
775 }
776 },
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000777#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000778 {
779 "ignore",
780 {
781 "ignore_errors",
782 ignore_errors,
783 METH_O
784 }
785 },
786 {
787 "replace",
788 {
789 "replace_errors",
790 replace_errors,
791 METH_O
792 }
793 },
794 {
795 "xmlcharrefreplace",
796 {
797 "xmlcharrefreplace_errors",
798 xmlcharrefreplace_errors,
799 METH_O
800 }
801 },
802 {
803 "backslashreplace",
804 {
805 "backslashreplace_errors",
806 backslashreplace_errors,
807 METH_O
808 }
809 }
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000810#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000812
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000813 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000814 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000815 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000816
817 if (interp->codec_search_path != NULL)
818 return 0;
819
820 interp->codec_search_path = PyList_New(0);
821 interp->codec_search_cache = PyDict_New();
822 interp->codec_error_registry = PyDict_New();
823
824 if (interp->codec_error_registry) {
825 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
826 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
827 int res;
828 if (!func)
829 Py_FatalError("can't initialize codec error registry");
830 res = PyCodec_RegisterError(methods[i].name, func);
831 Py_DECREF(func);
832 if (res)
833 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000834 }
835 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000836
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000837 if (interp->codec_search_path == NULL ||
838 interp->codec_search_cache == NULL ||
839 interp->codec_error_registry == NULL)
840 Py_FatalError("can't initialize codec registry");
841
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000842 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000843 if (mod == NULL) {
844 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
845 /* Ignore ImportErrors... this is done so that
846 distributions can disable the encodings package. Note
847 that other errors are not masked, e.g. SystemErrors
848 raised to inform the user of an error in the Python
849 configuration are still reported back to the user. */
850 PyErr_Clear();
851 return 0;
852 }
853 return -1;
854 }
855 Py_DECREF(mod);
856 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000857}