blob: 0e8c37498077466c65fa183bd4a6d7958ef9085b [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
17 codec search function.
18
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Guido van Rossum582acec2000-06-28 22:07:35 +000059 if (len > INT_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
63
64 v = PyString_FromStringAndSize(NULL, (int)len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
73 ch = tolower(ch);
74 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000098 int i, len;
99
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000170 PyDict_SetItem(interp->codec_search_cache, v, result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177}
178
179static
180PyObject *args_tuple(PyObject *object,
181 const char *errors)
182{
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201}
202
203/* Build a codec by calling factory(stream[,errors]) or just
204 factory(errors) depending on whether the given parameters are
205 non-NULL. */
206
207static
208PyObject *build_stream_codec(PyObject *factory,
209 PyObject *stream,
210 const char *errors)
211{
212 PyObject *args, *codec;
213
214 args = args_tuple(stream, errors);
215 if (args == NULL)
216 return NULL;
217
218 codec = PyEval_CallObject(factory, args);
219 Py_DECREF(args);
220 return codec;
221}
222
223/* Convenience APIs to query the Codec registry.
224
225 All APIs return a codec object with incremented refcount.
226
227 */
228
229PyObject *PyCodec_Encoder(const char *encoding)
230{
231 PyObject *codecs;
232 PyObject *v;
233
234 codecs = _PyCodec_Lookup(encoding);
235 if (codecs == NULL)
236 goto onError;
237 v = PyTuple_GET_ITEM(codecs,0);
Mark Hammonde21262c2002-07-18 23:06:17 +0000238 Py_DECREF(codecs);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000239 Py_INCREF(v);
240 return v;
241
242 onError:
243 return NULL;
244}
245
246PyObject *PyCodec_Decoder(const char *encoding)
247{
248 PyObject *codecs;
249 PyObject *v;
250
251 codecs = _PyCodec_Lookup(encoding);
252 if (codecs == NULL)
253 goto onError;
254 v = PyTuple_GET_ITEM(codecs,1);
Mark Hammonde21262c2002-07-18 23:06:17 +0000255 Py_DECREF(codecs);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256 Py_INCREF(v);
257 return v;
258
259 onError:
260 return NULL;
261}
262
Thomas Woutersa9773292006-04-21 09:43:23 +0000263PyObject *PyCodec_IncrementalEncoder(const char *encoding,
264 const char *errors)
265{
266 PyObject *codecs, *ret, *encoder;
267
268 codecs = _PyCodec_Lookup(encoding);
269 if (codecs == NULL)
270 goto onError;
271 encoder = PyObject_GetAttrString(codecs, "incrementalencoder");
272 if (encoder == NULL) {
273 Py_DECREF(codecs);
274 return NULL;
275 }
276 if (errors)
277 ret = PyObject_CallFunction(encoder, "O", errors);
278 else
279 ret = PyObject_CallFunction(encoder, NULL);
280 Py_DECREF(encoder);
281 Py_DECREF(codecs);
282 return ret;
283
284 onError:
285 return NULL;
286}
287
288PyObject *PyCodec_IncrementalDecoder(const char *encoding,
289 const char *errors)
290{
291 PyObject *codecs, *ret, *decoder;
292
293 codecs = _PyCodec_Lookup(encoding);
294 if (codecs == NULL)
295 goto onError;
296 decoder = PyObject_GetAttrString(codecs, "incrementaldecoder");
297 if (decoder == NULL) {
298 Py_DECREF(codecs);
299 return NULL;
300 }
301 if (errors)
302 ret = PyObject_CallFunction(decoder, "O", errors);
303 else
304 ret = PyObject_CallFunction(decoder, NULL);
305 Py_DECREF(decoder);
306 Py_DECREF(codecs);
307 return ret;
308
309 onError:
310 return NULL;
311}
312
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000313PyObject *PyCodec_StreamReader(const char *encoding,
314 PyObject *stream,
315 const char *errors)
316{
Mark Hammonde21262c2002-07-18 23:06:17 +0000317 PyObject *codecs, *ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000318
319 codecs = _PyCodec_Lookup(encoding);
320 if (codecs == NULL)
321 goto onError;
Mark Hammonde21262c2002-07-18 23:06:17 +0000322 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
323 Py_DECREF(codecs);
324 return ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000325
326 onError:
327 return NULL;
328}
329
330PyObject *PyCodec_StreamWriter(const char *encoding,
331 PyObject *stream,
332 const char *errors)
333{
Mark Hammonde21262c2002-07-18 23:06:17 +0000334 PyObject *codecs, *ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000335
336 codecs = _PyCodec_Lookup(encoding);
337 if (codecs == NULL)
338 goto onError;
Mark Hammonde21262c2002-07-18 23:06:17 +0000339 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
340 Py_DECREF(codecs);
341 return ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000342
343 onError:
344 return NULL;
345}
346
347/* Encode an object (e.g. an Unicode object) using the given encoding
348 and return the resulting encoded object (usually a Python string).
349
350 errors is passed to the encoder factory as argument if non-NULL. */
351
352PyObject *PyCodec_Encode(PyObject *object,
353 const char *encoding,
354 const char *errors)
355{
356 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000357 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000358 PyObject *v;
359
360 encoder = PyCodec_Encoder(encoding);
361 if (encoder == NULL)
362 goto onError;
363
364 args = args_tuple(object, errors);
365 if (args == NULL)
366 goto onError;
367
368 result = PyEval_CallObject(encoder,args);
369 if (result == NULL)
370 goto onError;
371
372 if (!PyTuple_Check(result) ||
373 PyTuple_GET_SIZE(result) != 2) {
374 PyErr_SetString(PyExc_TypeError,
375 "encoder must return a tuple (object,integer)");
376 goto onError;
377 }
378 v = PyTuple_GET_ITEM(result,0);
379 Py_INCREF(v);
380 /* We don't check or use the second (integer) entry. */
381
382 Py_DECREF(args);
383 Py_DECREF(encoder);
384 Py_DECREF(result);
385 return v;
386
387 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000388 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000389 Py_XDECREF(args);
390 Py_XDECREF(encoder);
391 return NULL;
392}
393
394/* Decode an object (usually a Python string) using the given encoding
395 and return an equivalent object (e.g. an Unicode object).
396
397 errors is passed to the decoder factory as argument if non-NULL. */
398
399PyObject *PyCodec_Decode(PyObject *object,
400 const char *encoding,
401 const char *errors)
402{
403 PyObject *decoder = NULL;
404 PyObject *args = NULL, *result = NULL;
405 PyObject *v;
406
407 decoder = PyCodec_Decoder(encoding);
408 if (decoder == NULL)
409 goto onError;
410
411 args = args_tuple(object, errors);
412 if (args == NULL)
413 goto onError;
414
415 result = PyEval_CallObject(decoder,args);
416 if (result == NULL)
417 goto onError;
418 if (!PyTuple_Check(result) ||
419 PyTuple_GET_SIZE(result) != 2) {
420 PyErr_SetString(PyExc_TypeError,
421 "decoder must return a tuple (object,integer)");
422 goto onError;
423 }
424 v = PyTuple_GET_ITEM(result,0);
425 Py_INCREF(v);
426 /* We don't check or use the second (integer) entry. */
427
428 Py_DECREF(args);
429 Py_DECREF(decoder);
430 Py_DECREF(result);
431 return v;
432
433 onError:
434 Py_XDECREF(args);
435 Py_XDECREF(decoder);
436 Py_XDECREF(result);
437 return NULL;
438}
439
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000440/* Register the error handling callback function error under the name
441 name. This function will be called by the codec when it encounters
442 an unencodable characters/undecodable bytes and doesn't know the
443 callback name, when name is specified as the error parameter
444 in the call to the encode/decode function.
445 Return 0 on success, -1 on error */
446int PyCodec_RegisterError(const char *name, PyObject *error)
447{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000448 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000449 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
450 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000451 if (!PyCallable_Check(error)) {
452 PyErr_SetString(PyExc_TypeError, "handler must be callable");
453 return -1;
454 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000455 return PyDict_SetItemString(interp->codec_error_registry,
456 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000457}
458
459/* Lookup the error handling callback function registered under the
460 name error. As a special case NULL can be passed, in which case
461 the error handling callback for strict encoding will be returned. */
462PyObject *PyCodec_LookupError(const char *name)
463{
464 PyObject *handler = NULL;
465
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000466 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000467 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
468 return NULL;
469
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000470 if (name==NULL)
471 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000472 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000473 if (!handler)
474 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
475 else
476 Py_INCREF(handler);
477 return handler;
478}
479
480static void wrong_exception_type(PyObject *exc)
481{
482 PyObject *type = PyObject_GetAttrString(exc, "__class__");
483 if (type != NULL) {
484 PyObject *name = PyObject_GetAttrString(type, "__name__");
485 Py_DECREF(type);
486 if (name != NULL) {
487 PyObject *string = PyObject_Str(name);
488 Py_DECREF(name);
Walter Dörwaldf7bcd1d2002-09-02 18:22:32 +0000489 if (string != NULL) {
490 PyErr_Format(PyExc_TypeError,
491 "don't know how to handle %.400s in error callback",
492 PyString_AS_STRING(string));
493 Py_DECREF(string);
494 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000495 }
496 }
497}
498
499PyObject *PyCodec_StrictErrors(PyObject *exc)
500{
Brett Cannonbf364092006-03-01 04:25:17 +0000501 if (PyExceptionInstance_Check(exc))
502 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000503 else
504 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
505 return NULL;
506}
507
508
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000509#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000510PyObject *PyCodec_IgnoreErrors(PyObject *exc)
511{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000512 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000513 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
514 if (PyUnicodeEncodeError_GetEnd(exc, &end))
515 return NULL;
516 }
517 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
518 if (PyUnicodeDecodeError_GetEnd(exc, &end))
519 return NULL;
520 }
521 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
522 if (PyUnicodeTranslateError_GetEnd(exc, &end))
523 return NULL;
524 }
525 else {
526 wrong_exception_type(exc);
527 return NULL;
528 }
529 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000530 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000531}
532
533
534PyObject *PyCodec_ReplaceErrors(PyObject *exc)
535{
536 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000537 Py_ssize_t start;
538 Py_ssize_t end;
539 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000540
541 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
542 PyObject *res;
543 Py_UNICODE *p;
544 if (PyUnicodeEncodeError_GetStart(exc, &start))
545 return NULL;
546 if (PyUnicodeEncodeError_GetEnd(exc, &end))
547 return NULL;
548 res = PyUnicode_FromUnicode(NULL, end-start);
549 if (res == NULL)
550 return NULL;
551 for (p = PyUnicode_AS_UNICODE(res), i = start;
552 i<end; ++p, ++i)
553 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000554 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000555 Py_DECREF(res);
556 return restuple;
557 }
558 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
559 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
560 if (PyUnicodeDecodeError_GetEnd(exc, &end))
561 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000562 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000563 }
564 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
565 PyObject *res;
566 Py_UNICODE *p;
567 if (PyUnicodeTranslateError_GetStart(exc, &start))
568 return NULL;
569 if (PyUnicodeTranslateError_GetEnd(exc, &end))
570 return NULL;
571 res = PyUnicode_FromUnicode(NULL, end-start);
572 if (res == NULL)
573 return NULL;
574 for (p = PyUnicode_AS_UNICODE(res), i = start;
575 i<end; ++p, ++i)
576 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000577 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000578 Py_DECREF(res);
579 return restuple;
580 }
581 else {
582 wrong_exception_type(exc);
583 return NULL;
584 }
585}
586
587PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
588{
589 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
590 PyObject *restuple;
591 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000592 Py_ssize_t start;
593 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000594 PyObject *res;
595 Py_UNICODE *p;
596 Py_UNICODE *startp;
597 Py_UNICODE *outp;
598 int ressize;
599 if (PyUnicodeEncodeError_GetStart(exc, &start))
600 return NULL;
601 if (PyUnicodeEncodeError_GetEnd(exc, &end))
602 return NULL;
603 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
604 return NULL;
605 startp = PyUnicode_AS_UNICODE(object);
606 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
607 if (*p<10)
608 ressize += 2+1+1;
609 else if (*p<100)
610 ressize += 2+2+1;
611 else if (*p<1000)
612 ressize += 2+3+1;
613 else if (*p<10000)
614 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000615#ifndef Py_UNICODE_WIDE
616 else
617 ressize += 2+5+1;
618#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000619 else if (*p<100000)
620 ressize += 2+5+1;
621 else if (*p<1000000)
622 ressize += 2+6+1;
623 else
624 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000625#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 }
627 /* allocate replacement */
628 res = PyUnicode_FromUnicode(NULL, ressize);
629 if (res == NULL) {
630 Py_DECREF(object);
631 return NULL;
632 }
633 /* generate replacement */
634 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
635 p < startp+end; ++p) {
636 Py_UNICODE c = *p;
637 int digits;
638 int base;
639 *outp++ = '&';
640 *outp++ = '#';
641 if (*p<10) {
642 digits = 1;
643 base = 1;
644 }
645 else if (*p<100) {
646 digits = 2;
647 base = 10;
648 }
649 else if (*p<1000) {
650 digits = 3;
651 base = 100;
652 }
653 else if (*p<10000) {
654 digits = 4;
655 base = 1000;
656 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000657#ifndef Py_UNICODE_WIDE
658 else {
659 digits = 5;
660 base = 10000;
661 }
662#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000663 else if (*p<100000) {
664 digits = 5;
665 base = 10000;
666 }
667 else if (*p<1000000) {
668 digits = 6;
669 base = 100000;
670 }
671 else {
672 digits = 7;
673 base = 1000000;
674 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000675#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676 while (digits-->0) {
677 *outp++ = '0' + c/base;
678 c %= base;
679 base /= 10;
680 }
681 *outp++ = ';';
682 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000683 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000684 Py_DECREF(res);
685 Py_DECREF(object);
686 return restuple;
687 }
688 else {
689 wrong_exception_type(exc);
690 return NULL;
691 }
692}
693
694static Py_UNICODE hexdigits[] = {
695 '0', '1', '2', '3', '4', '5', '6', '7',
696 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
697};
698
699PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
700{
701 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
702 PyObject *restuple;
703 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000704 Py_ssize_t start;
705 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000706 PyObject *res;
707 Py_UNICODE *p;
708 Py_UNICODE *startp;
709 Py_UNICODE *outp;
710 int ressize;
711 if (PyUnicodeEncodeError_GetStart(exc, &start))
712 return NULL;
713 if (PyUnicodeEncodeError_GetEnd(exc, &end))
714 return NULL;
715 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
716 return NULL;
717 startp = PyUnicode_AS_UNICODE(object);
718 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000719#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000720 if (*p >= 0x00010000)
721 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000722 else
723#endif
724 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000725 ressize += 1+1+4;
726 }
727 else
728 ressize += 1+1+2;
729 }
730 res = PyUnicode_FromUnicode(NULL, ressize);
731 if (res==NULL)
732 return NULL;
733 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
734 p < startp+end; ++p) {
735 Py_UNICODE c = *p;
736 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000737#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000738 if (c >= 0x00010000) {
739 *outp++ = 'U';
740 *outp++ = hexdigits[(c>>28)&0xf];
741 *outp++ = hexdigits[(c>>24)&0xf];
742 *outp++ = hexdigits[(c>>20)&0xf];
743 *outp++ = hexdigits[(c>>16)&0xf];
744 *outp++ = hexdigits[(c>>12)&0xf];
745 *outp++ = hexdigits[(c>>8)&0xf];
746 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000747 else
748#endif
749 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750 *outp++ = 'u';
751 *outp++ = hexdigits[(c>>12)&0xf];
752 *outp++ = hexdigits[(c>>8)&0xf];
753 }
754 else
755 *outp++ = 'x';
756 *outp++ = hexdigits[(c>>4)&0xf];
757 *outp++ = hexdigits[c&0xf];
758 }
759
Martin v. Löwis18e16552006-02-15 17:27:45 +0000760 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000761 Py_DECREF(res);
762 Py_DECREF(object);
763 return restuple;
764 }
765 else {
766 wrong_exception_type(exc);
767 return NULL;
768 }
769}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000770#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000771
772static PyObject *strict_errors(PyObject *self, PyObject *exc)
773{
774 return PyCodec_StrictErrors(exc);
775}
776
777
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000778#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000779static PyObject *ignore_errors(PyObject *self, PyObject *exc)
780{
781 return PyCodec_IgnoreErrors(exc);
782}
783
784
785static PyObject *replace_errors(PyObject *self, PyObject *exc)
786{
787 return PyCodec_ReplaceErrors(exc);
788}
789
790
791static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
792{
793 return PyCodec_XMLCharRefReplaceErrors(exc);
794}
795
796
797static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
798{
799 return PyCodec_BackslashReplaceErrors(exc);
800}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000801#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000803static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000804{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805 static struct {
806 char *name;
807 PyMethodDef def;
808 } methods[] =
809 {
810 {
811 "strict",
812 {
813 "strict_errors",
814 strict_errors,
815 METH_O
816 }
817 },
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000818#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000819 {
820 "ignore",
821 {
822 "ignore_errors",
823 ignore_errors,
824 METH_O
825 }
826 },
827 {
828 "replace",
829 {
830 "replace_errors",
831 replace_errors,
832 METH_O
833 }
834 },
835 {
836 "xmlcharrefreplace",
837 {
838 "xmlcharrefreplace_errors",
839 xmlcharrefreplace_errors,
840 METH_O
841 }
842 },
843 {
844 "backslashreplace",
845 {
846 "backslashreplace_errors",
847 backslashreplace_errors,
848 METH_O
849 }
850 }
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000851#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000854 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000855 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000856 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000857
858 if (interp->codec_search_path != NULL)
859 return 0;
860
861 interp->codec_search_path = PyList_New(0);
862 interp->codec_search_cache = PyDict_New();
863 interp->codec_error_registry = PyDict_New();
864
865 if (interp->codec_error_registry) {
866 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
867 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
868 int res;
869 if (!func)
870 Py_FatalError("can't initialize codec error registry");
871 res = PyCodec_RegisterError(methods[i].name, func);
872 Py_DECREF(func);
873 if (res)
874 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000875 }
876 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000877
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000878 if (interp->codec_search_path == NULL ||
879 interp->codec_search_cache == NULL ||
880 interp->codec_error_registry == NULL)
881 Py_FatalError("can't initialize codec registry");
882
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000883 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000884 if (mod == NULL) {
885 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
886 /* Ignore ImportErrors... this is done so that
887 distributions can disable the encodings package. Note
888 that other errors are not masked, e.g. SystemErrors
889 raised to inform the user of an error in the Python
890 configuration are still reported back to the user. */
891 PyErr_Clear();
892 return 0;
893 }
894 return -1;
895 }
896 Py_DECREF(mod);
897 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000898}