blob: 464fffc1507725615aab18971dd9ad936080d20a [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not downgrade startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
26
/* Forward declaration; the registry functions below call this lazily. */
static int _PyCodecRegistry_Init(void);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum582acec2000-06-28 22:07:35 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 v = PyString_FromStringAndSize(NULL, len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000073 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000074 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +000098 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000170 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
171 Py_DECREF(result);
172 goto onError;
173 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000174 Py_DECREF(args);
175 return result;
176
177 onError:
178 Py_XDECREF(args);
179 return NULL;
180}
181
182static
183PyObject *args_tuple(PyObject *object,
184 const char *errors)
185{
186 PyObject *args;
187
188 args = PyTuple_New(1 + (errors != NULL));
189 if (args == NULL)
190 return NULL;
191 Py_INCREF(object);
192 PyTuple_SET_ITEM(args,0,object);
193 if (errors) {
194 PyObject *v;
195
196 v = PyString_FromString(errors);
197 if (v == NULL) {
198 Py_DECREF(args);
199 return NULL;
200 }
201 PyTuple_SET_ITEM(args, 1, v);
202 }
203 return args;
204}
205
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000207
208static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000209PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000210{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 PyObject *codecs;
212 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000213
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000214 codecs = _PyCodec_Lookup(encoding);
215 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000216 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000217 v = PyTuple_GET_ITEM(codecs, index);
218 Py_DECREF(codecs);
219 Py_INCREF(v);
220 return v;
221}
222
223/* Helper function to create an incremental codec. */
224
225static
226PyObject *codec_getincrementalcodec(const char *encoding,
227 const char *errors,
228 const char *attrname)
229{
230 PyObject *codecs, *ret, *inccodec;
231
232 codecs = _PyCodec_Lookup(encoding);
233 if (codecs == NULL)
234 return NULL;
235 inccodec = PyObject_GetAttrString(codecs, attrname);
236 Py_DECREF(codecs);
237 if (inccodec == NULL)
238 return NULL;
239 if (errors)
240 ret = PyObject_CallFunction(inccodec, "s", errors);
241 else
242 ret = PyObject_CallFunction(inccodec, NULL);
243 Py_DECREF(inccodec);
244 return ret;
245}
246
247/* Helper function to create a stream codec. */
248
249static
250PyObject *codec_getstreamcodec(const char *encoding,
251 PyObject *stream,
252 const char *errors,
253 const int index)
254{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000255 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256
257 codecs = _PyCodec_Lookup(encoding);
258 if (codecs == NULL)
259 return NULL;
260
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000261 codeccls = PyTuple_GET_ITEM(codecs, index);
262 if (errors != NULL)
263 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
264 else
265 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 Py_DECREF(codecs);
267 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000268}
269
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.

 */
275
276PyObject *PyCodec_Encoder(const char *encoding)
277{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000279}
280
281PyObject *PyCodec_Decoder(const char *encoding)
282{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000284}
285
Thomas Woutersa9773292006-04-21 09:43:23 +0000286PyObject *PyCodec_IncrementalEncoder(const char *encoding,
287 const char *errors)
288{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000290}
291
292PyObject *PyCodec_IncrementalDecoder(const char *encoding,
293 const char *errors)
294{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000296}
297
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000298PyObject *PyCodec_StreamReader(const char *encoding,
299 PyObject *stream,
300 const char *errors)
301{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000302 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000303}
304
305PyObject *PyCodec_StreamWriter(const char *encoding,
306 PyObject *stream,
307 const char *errors)
308{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000309 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000310}
311
312/* Encode an object (e.g. an Unicode object) using the given encoding
313 and return the resulting encoded object (usually a Python string).
314
315 errors is passed to the encoder factory as argument if non-NULL. */
316
317PyObject *PyCodec_Encode(PyObject *object,
318 const char *encoding,
319 const char *errors)
320{
321 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000322 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000323 PyObject *v;
324
325 encoder = PyCodec_Encoder(encoding);
326 if (encoder == NULL)
327 goto onError;
328
329 args = args_tuple(object, errors);
330 if (args == NULL)
331 goto onError;
332
333 result = PyEval_CallObject(encoder,args);
334 if (result == NULL)
335 goto onError;
336
337 if (!PyTuple_Check(result) ||
338 PyTuple_GET_SIZE(result) != 2) {
339 PyErr_SetString(PyExc_TypeError,
340 "encoder must return a tuple (object,integer)");
341 goto onError;
342 }
343 v = PyTuple_GET_ITEM(result,0);
344 Py_INCREF(v);
345 /* We don't check or use the second (integer) entry. */
346
347 Py_DECREF(args);
348 Py_DECREF(encoder);
349 Py_DECREF(result);
350 return v;
351
352 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000353 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000354 Py_XDECREF(args);
355 Py_XDECREF(encoder);
356 return NULL;
357}
358
359/* Decode an object (usually a Python string) using the given encoding
360 and return an equivalent object (e.g. an Unicode object).
361
362 errors is passed to the decoder factory as argument if non-NULL. */
363
364PyObject *PyCodec_Decode(PyObject *object,
365 const char *encoding,
366 const char *errors)
367{
368 PyObject *decoder = NULL;
369 PyObject *args = NULL, *result = NULL;
370 PyObject *v;
371
372 decoder = PyCodec_Decoder(encoding);
373 if (decoder == NULL)
374 goto onError;
375
376 args = args_tuple(object, errors);
377 if (args == NULL)
378 goto onError;
379
380 result = PyEval_CallObject(decoder,args);
381 if (result == NULL)
382 goto onError;
383 if (!PyTuple_Check(result) ||
384 PyTuple_GET_SIZE(result) != 2) {
385 PyErr_SetString(PyExc_TypeError,
386 "decoder must return a tuple (object,integer)");
387 goto onError;
388 }
389 v = PyTuple_GET_ITEM(result,0);
390 Py_INCREF(v);
391 /* We don't check or use the second (integer) entry. */
392
393 Py_DECREF(args);
394 Py_DECREF(decoder);
395 Py_DECREF(result);
396 return v;
397
398 onError:
399 Py_XDECREF(args);
400 Py_XDECREF(decoder);
401 Py_XDECREF(result);
402 return NULL;
403}
404
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000405/* Register the error handling callback function error under the name
406 name. This function will be called by the codec when it encounters
407 an unencodable characters/undecodable bytes and doesn't know the
408 callback name, when name is specified as the error parameter
409 in the call to the encode/decode function.
410 Return 0 on success, -1 on error */
411int PyCodec_RegisterError(const char *name, PyObject *error)
412{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000413 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000414 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
415 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000416 if (!PyCallable_Check(error)) {
417 PyErr_SetString(PyExc_TypeError, "handler must be callable");
418 return -1;
419 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000420 return PyDict_SetItemString(interp->codec_error_registry,
421 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000422}
423
424/* Lookup the error handling callback function registered under the
425 name error. As a special case NULL can be passed, in which case
426 the error handling callback for strict encoding will be returned. */
427PyObject *PyCodec_LookupError(const char *name)
428{
429 PyObject *handler = NULL;
430
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000431 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000432 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
433 return NULL;
434
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000435 if (name==NULL)
436 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000437 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000438 if (!handler)
439 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
440 else
441 Py_INCREF(handler);
442 return handler;
443}
444
445static void wrong_exception_type(PyObject *exc)
446{
447 PyObject *type = PyObject_GetAttrString(exc, "__class__");
448 if (type != NULL) {
Walter Dörwald573c08c2007-05-25 15:46:59 +0000449 PyObject *name = PyObject_GetAttrString(type, "__name__");
450 Py_DECREF(type);
451 if (name != NULL) {
452 PyErr_Format(PyExc_TypeError,
453 "don't know how to handle %S in error callback", name);
454 Py_DECREF(name);
455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 }
457}
458
459PyObject *PyCodec_StrictErrors(PyObject *exc)
460{
Brett Cannonbf364092006-03-01 04:25:17 +0000461 if (PyExceptionInstance_Check(exc))
462 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000463 else
464 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
465 return NULL;
466}
467
468
469PyObject *PyCodec_IgnoreErrors(PyObject *exc)
470{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000471 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000472 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
473 if (PyUnicodeEncodeError_GetEnd(exc, &end))
474 return NULL;
475 }
476 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
477 if (PyUnicodeDecodeError_GetEnd(exc, &end))
478 return NULL;
479 }
480 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
481 if (PyUnicodeTranslateError_GetEnd(exc, &end))
482 return NULL;
483 }
484 else {
485 wrong_exception_type(exc);
486 return NULL;
487 }
488 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000489 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000490}
491
492
493PyObject *PyCodec_ReplaceErrors(PyObject *exc)
494{
495 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000496 Py_ssize_t start;
497 Py_ssize_t end;
498 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000499
500 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
501 PyObject *res;
502 Py_UNICODE *p;
503 if (PyUnicodeEncodeError_GetStart(exc, &start))
504 return NULL;
505 if (PyUnicodeEncodeError_GetEnd(exc, &end))
506 return NULL;
507 res = PyUnicode_FromUnicode(NULL, end-start);
508 if (res == NULL)
509 return NULL;
510 for (p = PyUnicode_AS_UNICODE(res), i = start;
511 i<end; ++p, ++i)
512 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000513 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000514 Py_DECREF(res);
515 return restuple;
516 }
517 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
518 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
519 if (PyUnicodeDecodeError_GetEnd(exc, &end))
520 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000521 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000522 }
523 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
524 PyObject *res;
525 Py_UNICODE *p;
526 if (PyUnicodeTranslateError_GetStart(exc, &start))
527 return NULL;
528 if (PyUnicodeTranslateError_GetEnd(exc, &end))
529 return NULL;
530 res = PyUnicode_FromUnicode(NULL, end-start);
531 if (res == NULL)
532 return NULL;
533 for (p = PyUnicode_AS_UNICODE(res), i = start;
534 i<end; ++p, ++i)
535 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000536 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537 Py_DECREF(res);
538 return restuple;
539 }
540 else {
541 wrong_exception_type(exc);
542 return NULL;
543 }
544}
545
546PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
547{
548 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
549 PyObject *restuple;
550 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000551 Py_ssize_t start;
552 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000553 PyObject *res;
554 Py_UNICODE *p;
555 Py_UNICODE *startp;
556 Py_UNICODE *outp;
557 int ressize;
558 if (PyUnicodeEncodeError_GetStart(exc, &start))
559 return NULL;
560 if (PyUnicodeEncodeError_GetEnd(exc, &end))
561 return NULL;
562 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
563 return NULL;
564 startp = PyUnicode_AS_UNICODE(object);
565 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
566 if (*p<10)
567 ressize += 2+1+1;
568 else if (*p<100)
569 ressize += 2+2+1;
570 else if (*p<1000)
571 ressize += 2+3+1;
572 else if (*p<10000)
573 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000574#ifndef Py_UNICODE_WIDE
575 else
576 ressize += 2+5+1;
577#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000578 else if (*p<100000)
579 ressize += 2+5+1;
580 else if (*p<1000000)
581 ressize += 2+6+1;
582 else
583 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000584#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000585 }
586 /* allocate replacement */
587 res = PyUnicode_FromUnicode(NULL, ressize);
588 if (res == NULL) {
589 Py_DECREF(object);
590 return NULL;
591 }
592 /* generate replacement */
593 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
594 p < startp+end; ++p) {
595 Py_UNICODE c = *p;
596 int digits;
597 int base;
598 *outp++ = '&';
599 *outp++ = '#';
600 if (*p<10) {
601 digits = 1;
602 base = 1;
603 }
604 else if (*p<100) {
605 digits = 2;
606 base = 10;
607 }
608 else if (*p<1000) {
609 digits = 3;
610 base = 100;
611 }
612 else if (*p<10000) {
613 digits = 4;
614 base = 1000;
615 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000616#ifndef Py_UNICODE_WIDE
617 else {
618 digits = 5;
619 base = 10000;
620 }
621#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000622 else if (*p<100000) {
623 digits = 5;
624 base = 10000;
625 }
626 else if (*p<1000000) {
627 digits = 6;
628 base = 100000;
629 }
630 else {
631 digits = 7;
632 base = 1000000;
633 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000634#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000635 while (digits-->0) {
636 *outp++ = '0' + c/base;
637 c %= base;
638 base /= 10;
639 }
640 *outp++ = ';';
641 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000642 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000643 Py_DECREF(res);
644 Py_DECREF(object);
645 return restuple;
646 }
647 else {
648 wrong_exception_type(exc);
649 return NULL;
650 }
651}
652
653static Py_UNICODE hexdigits[] = {
654 '0', '1', '2', '3', '4', '5', '6', '7',
655 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
656};
657
658PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
659{
660 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
661 PyObject *restuple;
662 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000663 Py_ssize_t start;
664 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000665 PyObject *res;
666 Py_UNICODE *p;
667 Py_UNICODE *startp;
668 Py_UNICODE *outp;
669 int ressize;
670 if (PyUnicodeEncodeError_GetStart(exc, &start))
671 return NULL;
672 if (PyUnicodeEncodeError_GetEnd(exc, &end))
673 return NULL;
674 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
675 return NULL;
676 startp = PyUnicode_AS_UNICODE(object);
677 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000678#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000679 if (*p >= 0x00010000)
680 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000681 else
682#endif
683 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000684 ressize += 1+1+4;
685 }
686 else
687 ressize += 1+1+2;
688 }
689 res = PyUnicode_FromUnicode(NULL, ressize);
690 if (res==NULL)
691 return NULL;
692 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
693 p < startp+end; ++p) {
694 Py_UNICODE c = *p;
695 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000696#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697 if (c >= 0x00010000) {
698 *outp++ = 'U';
699 *outp++ = hexdigits[(c>>28)&0xf];
700 *outp++ = hexdigits[(c>>24)&0xf];
701 *outp++ = hexdigits[(c>>20)&0xf];
702 *outp++ = hexdigits[(c>>16)&0xf];
703 *outp++ = hexdigits[(c>>12)&0xf];
704 *outp++ = hexdigits[(c>>8)&0xf];
705 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000706 else
707#endif
708 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000709 *outp++ = 'u';
710 *outp++ = hexdigits[(c>>12)&0xf];
711 *outp++ = hexdigits[(c>>8)&0xf];
712 }
713 else
714 *outp++ = 'x';
715 *outp++ = hexdigits[(c>>4)&0xf];
716 *outp++ = hexdigits[c&0xf];
717 }
718
Martin v. Löwis18e16552006-02-15 17:27:45 +0000719 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000720 Py_DECREF(res);
721 Py_DECREF(object);
722 return restuple;
723 }
724 else {
725 wrong_exception_type(exc);
726 return NULL;
727 }
728}
729
730static PyObject *strict_errors(PyObject *self, PyObject *exc)
731{
732 return PyCodec_StrictErrors(exc);
733}
734
735
736static PyObject *ignore_errors(PyObject *self, PyObject *exc)
737{
738 return PyCodec_IgnoreErrors(exc);
739}
740
741
742static PyObject *replace_errors(PyObject *self, PyObject *exc)
743{
744 return PyCodec_ReplaceErrors(exc);
745}
746
747
748static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
749{
750 return PyCodec_XMLCharRefReplaceErrors(exc);
751}
752
753
754static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
755{
756 return PyCodec_BackslashReplaceErrors(exc);
757}
758
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000759static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000760{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000761 static struct {
762 char *name;
763 PyMethodDef def;
764 } methods[] =
765 {
766 {
767 "strict",
768 {
769 "strict_errors",
770 strict_errors,
771 METH_O
772 }
773 },
774 {
775 "ignore",
776 {
777 "ignore_errors",
778 ignore_errors,
779 METH_O
780 }
781 },
782 {
783 "replace",
784 {
785 "replace_errors",
786 replace_errors,
787 METH_O
788 }
789 },
790 {
791 "xmlcharrefreplace",
792 {
793 "xmlcharrefreplace_errors",
794 xmlcharrefreplace_errors,
795 METH_O
796 }
797 },
798 {
799 "backslashreplace",
800 {
801 "backslashreplace_errors",
802 backslashreplace_errors,
803 METH_O
804 }
805 }
806 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000808 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000809 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000810 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000811
812 if (interp->codec_search_path != NULL)
813 return 0;
814
815 interp->codec_search_path = PyList_New(0);
816 interp->codec_search_cache = PyDict_New();
817 interp->codec_error_registry = PyDict_New();
818
819 if (interp->codec_error_registry) {
820 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
821 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
822 int res;
823 if (!func)
824 Py_FatalError("can't initialize codec error registry");
825 res = PyCodec_RegisterError(methods[i].name, func);
826 Py_DECREF(func);
827 if (res)
828 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000829 }
830 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000831
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000832 if (interp->codec_search_path == NULL ||
833 interp->codec_search_cache == NULL ||
834 interp->codec_error_registry == NULL)
835 Py_FatalError("can't initialize codec registry");
836
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000837 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000838 if (mod == NULL) {
839 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
840 /* Ignore ImportErrors... this is done so that
841 distributions can disable the encodings package. Note
842 that other errors are not masked, e.g. SystemErrors
843 raised to inform the user of an error in the Python
844 configuration are still reported back to the user. */
845 PyErr_Clear();
846 return 0;
847 }
848 return -1;
849 }
850 Py_DECREF(mod);
851 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000852}