blob: ddd19359ac927d51c7e248b51364bd63b830f496 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
17 codec search function.
18
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum582acec2000-06-28 22:07:35 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 v = PyString_FromStringAndSize(NULL, len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000073 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000074 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +000098 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000170 PyDict_SetItem(interp->codec_search_cache, v, result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177}
178
179static
180PyObject *args_tuple(PyObject *object,
181 const char *errors)
182{
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201}
202
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000203/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000204
205static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000207{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000208 PyObject *codecs;
209 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000210
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 codecs = _PyCodec_Lookup(encoding);
212 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000213 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000214 v = PyTuple_GET_ITEM(codecs, index);
215 Py_DECREF(codecs);
216 Py_INCREF(v);
217 return v;
218}
219
220/* Helper function to create an incremental codec. */
221
222static
223PyObject *codec_getincrementalcodec(const char *encoding,
224 const char *errors,
225 const char *attrname)
226{
227 PyObject *codecs, *ret, *inccodec;
228
229 codecs = _PyCodec_Lookup(encoding);
230 if (codecs == NULL)
231 return NULL;
232 inccodec = PyObject_GetAttrString(codecs, attrname);
233 Py_DECREF(codecs);
234 if (inccodec == NULL)
235 return NULL;
236 if (errors)
237 ret = PyObject_CallFunction(inccodec, "s", errors);
238 else
239 ret = PyObject_CallFunction(inccodec, NULL);
240 Py_DECREF(inccodec);
241 return ret;
242}
243
244/* Helper function to create a stream codec. */
245
246static
247PyObject *codec_getstreamcodec(const char *encoding,
248 PyObject *stream,
249 const char *errors,
250 const int index)
251{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000252 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253
254 codecs = _PyCodec_Lookup(encoding);
255 if (codecs == NULL)
256 return NULL;
257
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000258 codeccls = PyTuple_GET_ITEM(codecs, index);
259 if (errors != NULL)
260 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
261 else
262 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000263 Py_DECREF(codecs);
264 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000265}
266
267/* Convenience APIs to query the Codec registry.
268
269 All APIs return a codec object with incremented refcount.
270
271 */
272
273PyObject *PyCodec_Encoder(const char *encoding)
274{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000276}
277
278PyObject *PyCodec_Decoder(const char *encoding)
279{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000281}
282
Thomas Woutersa9773292006-04-21 09:43:23 +0000283PyObject *PyCodec_IncrementalEncoder(const char *encoding,
284 const char *errors)
285{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000287}
288
289PyObject *PyCodec_IncrementalDecoder(const char *encoding,
290 const char *errors)
291{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000293}
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295PyObject *PyCodec_StreamReader(const char *encoding,
296 PyObject *stream,
297 const char *errors)
298{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000300}
301
302PyObject *PyCodec_StreamWriter(const char *encoding,
303 PyObject *stream,
304 const char *errors)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
309/* Encode an object (e.g. an Unicode object) using the given encoding
310 and return the resulting encoded object (usually a Python string).
311
312 errors is passed to the encoder factory as argument if non-NULL. */
313
314PyObject *PyCodec_Encode(PyObject *object,
315 const char *encoding,
316 const char *errors)
317{
318 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000319 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000320 PyObject *v;
321
322 encoder = PyCodec_Encoder(encoding);
323 if (encoder == NULL)
324 goto onError;
325
326 args = args_tuple(object, errors);
327 if (args == NULL)
328 goto onError;
329
330 result = PyEval_CallObject(encoder,args);
331 if (result == NULL)
332 goto onError;
333
334 if (!PyTuple_Check(result) ||
335 PyTuple_GET_SIZE(result) != 2) {
336 PyErr_SetString(PyExc_TypeError,
337 "encoder must return a tuple (object,integer)");
338 goto onError;
339 }
340 v = PyTuple_GET_ITEM(result,0);
341 Py_INCREF(v);
342 /* We don't check or use the second (integer) entry. */
343
344 Py_DECREF(args);
345 Py_DECREF(encoder);
346 Py_DECREF(result);
347 return v;
348
349 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000350 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351 Py_XDECREF(args);
352 Py_XDECREF(encoder);
353 return NULL;
354}
355
356/* Decode an object (usually a Python string) using the given encoding
357 and return an equivalent object (e.g. an Unicode object).
358
359 errors is passed to the decoder factory as argument if non-NULL. */
360
361PyObject *PyCodec_Decode(PyObject *object,
362 const char *encoding,
363 const char *errors)
364{
365 PyObject *decoder = NULL;
366 PyObject *args = NULL, *result = NULL;
367 PyObject *v;
368
369 decoder = PyCodec_Decoder(encoding);
370 if (decoder == NULL)
371 goto onError;
372
373 args = args_tuple(object, errors);
374 if (args == NULL)
375 goto onError;
376
377 result = PyEval_CallObject(decoder,args);
378 if (result == NULL)
379 goto onError;
380 if (!PyTuple_Check(result) ||
381 PyTuple_GET_SIZE(result) != 2) {
382 PyErr_SetString(PyExc_TypeError,
383 "decoder must return a tuple (object,integer)");
384 goto onError;
385 }
386 v = PyTuple_GET_ITEM(result,0);
387 Py_INCREF(v);
388 /* We don't check or use the second (integer) entry. */
389
390 Py_DECREF(args);
391 Py_DECREF(decoder);
392 Py_DECREF(result);
393 return v;
394
395 onError:
396 Py_XDECREF(args);
397 Py_XDECREF(decoder);
398 Py_XDECREF(result);
399 return NULL;
400}
401
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and does not itself know
   the error handling scheme called name, i.e. when name is specified
   as the error parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error. */
408int PyCodec_RegisterError(const char *name, PyObject *error)
409{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000410 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000411 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
412 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000413 if (!PyCallable_Check(error)) {
414 PyErr_SetString(PyExc_TypeError, "handler must be callable");
415 return -1;
416 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000417 return PyDict_SetItemString(interp->codec_error_registry,
418 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000419}
420
/* Lookup the error handling callback function registered under the
   given name. As a special case NULL can be passed, in which case
   the error handling callback for strict encoding will be returned. */
424PyObject *PyCodec_LookupError(const char *name)
425{
426 PyObject *handler = NULL;
427
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000428 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000429 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
430 return NULL;
431
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000432 if (name==NULL)
433 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000434 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000435 if (!handler)
436 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
437 else
438 Py_INCREF(handler);
439 return handler;
440}
441
442static void wrong_exception_type(PyObject *exc)
443{
444 PyObject *type = PyObject_GetAttrString(exc, "__class__");
445 if (type != NULL) {
446 PyObject *name = PyObject_GetAttrString(type, "__name__");
447 Py_DECREF(type);
448 if (name != NULL) {
449 PyObject *string = PyObject_Str(name);
450 Py_DECREF(name);
Walter Dörwaldf7bcd1d2002-09-02 18:22:32 +0000451 if (string != NULL) {
452 PyErr_Format(PyExc_TypeError,
453 "don't know how to handle %.400s in error callback",
454 PyString_AS_STRING(string));
455 Py_DECREF(string);
456 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000457 }
458 }
459}
460
461PyObject *PyCodec_StrictErrors(PyObject *exc)
462{
Brett Cannonbf364092006-03-01 04:25:17 +0000463 if (PyExceptionInstance_Check(exc))
464 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000465 else
466 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
467 return NULL;
468}
469
470
471PyObject *PyCodec_IgnoreErrors(PyObject *exc)
472{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000473 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000474 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
475 if (PyUnicodeEncodeError_GetEnd(exc, &end))
476 return NULL;
477 }
478 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
479 if (PyUnicodeDecodeError_GetEnd(exc, &end))
480 return NULL;
481 }
482 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
483 if (PyUnicodeTranslateError_GetEnd(exc, &end))
484 return NULL;
485 }
486 else {
487 wrong_exception_type(exc);
488 return NULL;
489 }
490 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000491 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000492}
493
494
495PyObject *PyCodec_ReplaceErrors(PyObject *exc)
496{
497 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000498 Py_ssize_t start;
499 Py_ssize_t end;
500 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000501
502 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
503 PyObject *res;
504 Py_UNICODE *p;
505 if (PyUnicodeEncodeError_GetStart(exc, &start))
506 return NULL;
507 if (PyUnicodeEncodeError_GetEnd(exc, &end))
508 return NULL;
509 res = PyUnicode_FromUnicode(NULL, end-start);
510 if (res == NULL)
511 return NULL;
512 for (p = PyUnicode_AS_UNICODE(res), i = start;
513 i<end; ++p, ++i)
514 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000515 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000516 Py_DECREF(res);
517 return restuple;
518 }
519 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
520 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
521 if (PyUnicodeDecodeError_GetEnd(exc, &end))
522 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000523 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000524 }
525 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
526 PyObject *res;
527 Py_UNICODE *p;
528 if (PyUnicodeTranslateError_GetStart(exc, &start))
529 return NULL;
530 if (PyUnicodeTranslateError_GetEnd(exc, &end))
531 return NULL;
532 res = PyUnicode_FromUnicode(NULL, end-start);
533 if (res == NULL)
534 return NULL;
535 for (p = PyUnicode_AS_UNICODE(res), i = start;
536 i<end; ++p, ++i)
537 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000538 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000539 Py_DECREF(res);
540 return restuple;
541 }
542 else {
543 wrong_exception_type(exc);
544 return NULL;
545 }
546}
547
548PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
549{
550 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
551 PyObject *restuple;
552 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000553 Py_ssize_t start;
554 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000555 PyObject *res;
556 Py_UNICODE *p;
557 Py_UNICODE *startp;
558 Py_UNICODE *outp;
559 int ressize;
560 if (PyUnicodeEncodeError_GetStart(exc, &start))
561 return NULL;
562 if (PyUnicodeEncodeError_GetEnd(exc, &end))
563 return NULL;
564 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
565 return NULL;
566 startp = PyUnicode_AS_UNICODE(object);
567 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
568 if (*p<10)
569 ressize += 2+1+1;
570 else if (*p<100)
571 ressize += 2+2+1;
572 else if (*p<1000)
573 ressize += 2+3+1;
574 else if (*p<10000)
575 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000576#ifndef Py_UNICODE_WIDE
577 else
578 ressize += 2+5+1;
579#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000580 else if (*p<100000)
581 ressize += 2+5+1;
582 else if (*p<1000000)
583 ressize += 2+6+1;
584 else
585 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000586#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000587 }
588 /* allocate replacement */
589 res = PyUnicode_FromUnicode(NULL, ressize);
590 if (res == NULL) {
591 Py_DECREF(object);
592 return NULL;
593 }
594 /* generate replacement */
595 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
596 p < startp+end; ++p) {
597 Py_UNICODE c = *p;
598 int digits;
599 int base;
600 *outp++ = '&';
601 *outp++ = '#';
602 if (*p<10) {
603 digits = 1;
604 base = 1;
605 }
606 else if (*p<100) {
607 digits = 2;
608 base = 10;
609 }
610 else if (*p<1000) {
611 digits = 3;
612 base = 100;
613 }
614 else if (*p<10000) {
615 digits = 4;
616 base = 1000;
617 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000618#ifndef Py_UNICODE_WIDE
619 else {
620 digits = 5;
621 base = 10000;
622 }
623#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000624 else if (*p<100000) {
625 digits = 5;
626 base = 10000;
627 }
628 else if (*p<1000000) {
629 digits = 6;
630 base = 100000;
631 }
632 else {
633 digits = 7;
634 base = 1000000;
635 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000636#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000637 while (digits-->0) {
638 *outp++ = '0' + c/base;
639 c %= base;
640 base /= 10;
641 }
642 *outp++ = ';';
643 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 Py_DECREF(res);
646 Py_DECREF(object);
647 return restuple;
648 }
649 else {
650 wrong_exception_type(exc);
651 return NULL;
652 }
653}
654
655static Py_UNICODE hexdigits[] = {
656 '0', '1', '2', '3', '4', '5', '6', '7',
657 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
658};
659
660PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
661{
662 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
663 PyObject *restuple;
664 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000665 Py_ssize_t start;
666 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000667 PyObject *res;
668 Py_UNICODE *p;
669 Py_UNICODE *startp;
670 Py_UNICODE *outp;
671 int ressize;
672 if (PyUnicodeEncodeError_GetStart(exc, &start))
673 return NULL;
674 if (PyUnicodeEncodeError_GetEnd(exc, &end))
675 return NULL;
676 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
677 return NULL;
678 startp = PyUnicode_AS_UNICODE(object);
679 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000680#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 if (*p >= 0x00010000)
682 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000683 else
684#endif
685 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000686 ressize += 1+1+4;
687 }
688 else
689 ressize += 1+1+2;
690 }
691 res = PyUnicode_FromUnicode(NULL, ressize);
692 if (res==NULL)
693 return NULL;
694 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
695 p < startp+end; ++p) {
696 Py_UNICODE c = *p;
697 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000698#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000699 if (c >= 0x00010000) {
700 *outp++ = 'U';
701 *outp++ = hexdigits[(c>>28)&0xf];
702 *outp++ = hexdigits[(c>>24)&0xf];
703 *outp++ = hexdigits[(c>>20)&0xf];
704 *outp++ = hexdigits[(c>>16)&0xf];
705 *outp++ = hexdigits[(c>>12)&0xf];
706 *outp++ = hexdigits[(c>>8)&0xf];
707 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000708 else
709#endif
710 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000711 *outp++ = 'u';
712 *outp++ = hexdigits[(c>>12)&0xf];
713 *outp++ = hexdigits[(c>>8)&0xf];
714 }
715 else
716 *outp++ = 'x';
717 *outp++ = hexdigits[(c>>4)&0xf];
718 *outp++ = hexdigits[c&0xf];
719 }
720
Martin v. Löwis18e16552006-02-15 17:27:45 +0000721 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000722 Py_DECREF(res);
723 Py_DECREF(object);
724 return restuple;
725 }
726 else {
727 wrong_exception_type(exc);
728 return NULL;
729 }
730}
731
732static PyObject *strict_errors(PyObject *self, PyObject *exc)
733{
734 return PyCodec_StrictErrors(exc);
735}
736
737
738static PyObject *ignore_errors(PyObject *self, PyObject *exc)
739{
740 return PyCodec_IgnoreErrors(exc);
741}
742
743
744static PyObject *replace_errors(PyObject *self, PyObject *exc)
745{
746 return PyCodec_ReplaceErrors(exc);
747}
748
749
750static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
751{
752 return PyCodec_XMLCharRefReplaceErrors(exc);
753}
754
755
756static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
757{
758 return PyCodec_BackslashReplaceErrors(exc);
759}
760
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000761static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000762{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000763 static struct {
764 char *name;
765 PyMethodDef def;
766 } methods[] =
767 {
768 {
769 "strict",
770 {
771 "strict_errors",
772 strict_errors,
773 METH_O
774 }
775 },
776 {
777 "ignore",
778 {
779 "ignore_errors",
780 ignore_errors,
781 METH_O
782 }
783 },
784 {
785 "replace",
786 {
787 "replace_errors",
788 replace_errors,
789 METH_O
790 }
791 },
792 {
793 "xmlcharrefreplace",
794 {
795 "xmlcharrefreplace_errors",
796 xmlcharrefreplace_errors,
797 METH_O
798 }
799 },
800 {
801 "backslashreplace",
802 {
803 "backslashreplace_errors",
804 backslashreplace_errors,
805 METH_O
806 }
807 }
808 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000810 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000811 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000812 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000813
814 if (interp->codec_search_path != NULL)
815 return 0;
816
817 interp->codec_search_path = PyList_New(0);
818 interp->codec_search_cache = PyDict_New();
819 interp->codec_error_registry = PyDict_New();
820
821 if (interp->codec_error_registry) {
822 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
823 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
824 int res;
825 if (!func)
826 Py_FatalError("can't initialize codec error registry");
827 res = PyCodec_RegisterError(methods[i].name, func);
828 Py_DECREF(func);
829 if (res)
830 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000831 }
832 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000833
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000834 if (interp->codec_search_path == NULL ||
835 interp->codec_search_cache == NULL ||
836 interp->codec_error_registry == NULL)
837 Py_FatalError("can't initialize codec registry");
838
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000839 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000840 if (mod == NULL) {
841 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
842 /* Ignore ImportErrors... this is done so that
843 distributions can disable the encodings package. Note
844 that other errors are not masked, e.g. SystemErrors
845 raised to inform the user of an error in the Python
846 configuration are still reported back to the user. */
847 PyErr_Clear();
848 return 0;
849 }
850 return -1;
851 }
852 Py_DECREF(mod);
853 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000854}