blob: 1ba600912262896766812815400540dc012fadc1 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
17 codec search function.
18
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum582acec2000-06-28 22:07:35 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 v = PyString_FromStringAndSize(NULL, len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000073 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000074 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +000098 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000170 PyDict_SetItem(interp->codec_search_cache, v, result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177}
178
179static
180PyObject *args_tuple(PyObject *object,
181 const char *errors)
182{
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201}
202
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000203/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000204
205static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000206PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000207{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000208 PyObject *codecs;
209 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000210
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000211 codecs = _PyCodec_Lookup(encoding);
212 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000213 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000214 v = PyTuple_GET_ITEM(codecs, index);
215 Py_DECREF(codecs);
216 Py_INCREF(v);
217 return v;
218}
219
220/* Helper function to create an incremental codec. */
221
222static
223PyObject *codec_getincrementalcodec(const char *encoding,
224 const char *errors,
225 const char *attrname)
226{
227 PyObject *codecs, *ret, *inccodec;
228
229 codecs = _PyCodec_Lookup(encoding);
230 if (codecs == NULL)
231 return NULL;
232 inccodec = PyObject_GetAttrString(codecs, attrname);
233 Py_DECREF(codecs);
234 if (inccodec == NULL)
235 return NULL;
236 if (errors)
237 ret = PyObject_CallFunction(inccodec, "s", errors);
238 else
239 ret = PyObject_CallFunction(inccodec, NULL);
240 Py_DECREF(inccodec);
241 return ret;
242}
243
244/* Helper function to create a stream codec. */
245
246static
247PyObject *codec_getstreamcodec(const char *encoding,
248 PyObject *stream,
249 const char *errors,
250 const int index)
251{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000252 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253
254 codecs = _PyCodec_Lookup(encoding);
255 if (codecs == NULL)
256 return NULL;
257
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000258 codeccls = PyTuple_GET_ITEM(codecs, index);
259 if (errors != NULL)
260 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
261 else
262 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000263 Py_DECREF(codecs);
264 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000265}
266
267/* Convenience APIs to query the Codec registry.
268
269 All APIs return a codec object with incremented refcount.
270
271 */
272
273PyObject *PyCodec_Encoder(const char *encoding)
274{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000276}
277
278PyObject *PyCodec_Decoder(const char *encoding)
279{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000281}
282
Thomas Woutersa9773292006-04-21 09:43:23 +0000283PyObject *PyCodec_IncrementalEncoder(const char *encoding,
284 const char *errors)
285{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000287}
288
289PyObject *PyCodec_IncrementalDecoder(const char *encoding,
290 const char *errors)
291{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000292 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000293}
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295PyObject *PyCodec_StreamReader(const char *encoding,
296 PyObject *stream,
297 const char *errors)
298{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000300}
301
302PyObject *PyCodec_StreamWriter(const char *encoding,
303 PyObject *stream,
304 const char *errors)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
/* Encode an object (e.g. a Unicode object) using the given encoding
   and return the resulting encoded object (usually a Python string).

   errors is passed to the encoder as an additional argument if non-NULL. */
313
314PyObject *PyCodec_Encode(PyObject *object,
315 const char *encoding,
316 const char *errors)
317{
318 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000319 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000320 PyObject *v;
321
Martin v. Löwis641d5cc2007-06-11 04:19:13 +0000322 /* XXX short-cut a few common file system
323 encodings for now, as otherwise the import
324 code can't load the codec registry. */
325 if (strcmp(encoding, "utf-8") == 0 && PyUnicode_Check(object)) {
326 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(object),
327 PyUnicode_GET_SIZE(object),
328 errors);
329 }
330#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
331 if (strcmp(encoding, "mbcs") == 0 && PyUnicode_Check(object)) {
332 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(object),
333 PyUnicode_GET_SIZE(object),
334 errors);
335 }
336#endif
337
338
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000339 encoder = PyCodec_Encoder(encoding);
340 if (encoder == NULL)
341 goto onError;
342
343 args = args_tuple(object, errors);
344 if (args == NULL)
345 goto onError;
346
347 result = PyEval_CallObject(encoder,args);
348 if (result == NULL)
349 goto onError;
350
351 if (!PyTuple_Check(result) ||
352 PyTuple_GET_SIZE(result) != 2) {
353 PyErr_SetString(PyExc_TypeError,
354 "encoder must return a tuple (object,integer)");
355 goto onError;
356 }
357 v = PyTuple_GET_ITEM(result,0);
358 Py_INCREF(v);
359 /* We don't check or use the second (integer) entry. */
360
361 Py_DECREF(args);
362 Py_DECREF(encoder);
363 Py_DECREF(result);
364 return v;
365
366 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000367 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000368 Py_XDECREF(args);
369 Py_XDECREF(encoder);
370 return NULL;
371}
372
/* Decode an object (usually a Python string) using the given encoding
   and return an equivalent object (e.g. a Unicode object).

   errors is passed to the decoder as an additional argument if non-NULL. */
377
378PyObject *PyCodec_Decode(PyObject *object,
379 const char *encoding,
380 const char *errors)
381{
382 PyObject *decoder = NULL;
383 PyObject *args = NULL, *result = NULL;
384 PyObject *v;
385
386 decoder = PyCodec_Decoder(encoding);
387 if (decoder == NULL)
388 goto onError;
389
390 args = args_tuple(object, errors);
391 if (args == NULL)
392 goto onError;
393
394 result = PyEval_CallObject(decoder,args);
395 if (result == NULL)
396 goto onError;
397 if (!PyTuple_Check(result) ||
398 PyTuple_GET_SIZE(result) != 2) {
399 PyErr_SetString(PyExc_TypeError,
400 "decoder must return a tuple (object,integer)");
401 goto onError;
402 }
403 v = PyTuple_GET_ITEM(result,0);
404 Py_INCREF(v);
405 /* We don't check or use the second (integer) entry. */
406
407 Py_DECREF(args);
408 Py_DECREF(decoder);
409 Py_DECREF(result);
410 return v;
411
412 onError:
413 Py_XDECREF(args);
414 Py_XDECREF(decoder);
415 Py_XDECREF(result);
416 return NULL;
417}
418
/* Register the error handling callback function error under the name
   name. The codec calls this function when it encounters unencodable
   characters/undecodable bytes and name was specified as the errors
   parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error. */
425int PyCodec_RegisterError(const char *name, PyObject *error)
426{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000427 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000428 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
429 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000430 if (!PyCallable_Check(error)) {
431 PyErr_SetString(PyExc_TypeError, "handler must be callable");
432 return -1;
433 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000434 return PyDict_SetItemString(interp->codec_error_registry,
435 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000436}
437
/* Look up the error handling callback function registered under the
   given name. As a special case NULL can be passed, in which case
   the error handling callback registered as "strict" is returned. */
441PyObject *PyCodec_LookupError(const char *name)
442{
443 PyObject *handler = NULL;
444
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000445 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000446 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
447 return NULL;
448
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000449 if (name==NULL)
450 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000451 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000452 if (!handler)
453 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
454 else
455 Py_INCREF(handler);
456 return handler;
457}
458
459static void wrong_exception_type(PyObject *exc)
460{
461 PyObject *type = PyObject_GetAttrString(exc, "__class__");
462 if (type != NULL) {
Walter Dörwald573c08c2007-05-25 15:46:59 +0000463 PyObject *name = PyObject_GetAttrString(type, "__name__");
464 Py_DECREF(type);
465 if (name != NULL) {
466 PyErr_Format(PyExc_TypeError,
467 "don't know how to handle %S in error callback", name);
468 Py_DECREF(name);
469 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000470 }
471}
472
473PyObject *PyCodec_StrictErrors(PyObject *exc)
474{
Brett Cannonbf364092006-03-01 04:25:17 +0000475 if (PyExceptionInstance_Check(exc))
476 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000477 else
478 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
479 return NULL;
480}
481
482
483PyObject *PyCodec_IgnoreErrors(PyObject *exc)
484{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000485 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000486 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
487 if (PyUnicodeEncodeError_GetEnd(exc, &end))
488 return NULL;
489 }
490 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
491 if (PyUnicodeDecodeError_GetEnd(exc, &end))
492 return NULL;
493 }
494 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
495 if (PyUnicodeTranslateError_GetEnd(exc, &end))
496 return NULL;
497 }
498 else {
499 wrong_exception_type(exc);
500 return NULL;
501 }
502 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000503 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000504}
505
506
507PyObject *PyCodec_ReplaceErrors(PyObject *exc)
508{
509 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000510 Py_ssize_t start;
511 Py_ssize_t end;
512 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000513
514 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
515 PyObject *res;
516 Py_UNICODE *p;
517 if (PyUnicodeEncodeError_GetStart(exc, &start))
518 return NULL;
519 if (PyUnicodeEncodeError_GetEnd(exc, &end))
520 return NULL;
521 res = PyUnicode_FromUnicode(NULL, end-start);
522 if (res == NULL)
523 return NULL;
524 for (p = PyUnicode_AS_UNICODE(res), i = start;
525 i<end; ++p, ++i)
526 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000527 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528 Py_DECREF(res);
529 return restuple;
530 }
531 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
532 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
533 if (PyUnicodeDecodeError_GetEnd(exc, &end))
534 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000535 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000536 }
537 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
538 PyObject *res;
539 Py_UNICODE *p;
540 if (PyUnicodeTranslateError_GetStart(exc, &start))
541 return NULL;
542 if (PyUnicodeTranslateError_GetEnd(exc, &end))
543 return NULL;
544 res = PyUnicode_FromUnicode(NULL, end-start);
545 if (res == NULL)
546 return NULL;
547 for (p = PyUnicode_AS_UNICODE(res), i = start;
548 i<end; ++p, ++i)
549 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000550 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000551 Py_DECREF(res);
552 return restuple;
553 }
554 else {
555 wrong_exception_type(exc);
556 return NULL;
557 }
558}
559
560PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
561{
562 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
563 PyObject *restuple;
564 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000565 Py_ssize_t start;
566 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000567 PyObject *res;
568 Py_UNICODE *p;
569 Py_UNICODE *startp;
570 Py_UNICODE *outp;
571 int ressize;
572 if (PyUnicodeEncodeError_GetStart(exc, &start))
573 return NULL;
574 if (PyUnicodeEncodeError_GetEnd(exc, &end))
575 return NULL;
576 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
577 return NULL;
578 startp = PyUnicode_AS_UNICODE(object);
579 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
580 if (*p<10)
581 ressize += 2+1+1;
582 else if (*p<100)
583 ressize += 2+2+1;
584 else if (*p<1000)
585 ressize += 2+3+1;
586 else if (*p<10000)
587 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000588#ifndef Py_UNICODE_WIDE
589 else
590 ressize += 2+5+1;
591#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000592 else if (*p<100000)
593 ressize += 2+5+1;
594 else if (*p<1000000)
595 ressize += 2+6+1;
596 else
597 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000598#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000599 }
600 /* allocate replacement */
601 res = PyUnicode_FromUnicode(NULL, ressize);
602 if (res == NULL) {
603 Py_DECREF(object);
604 return NULL;
605 }
606 /* generate replacement */
607 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
608 p < startp+end; ++p) {
609 Py_UNICODE c = *p;
610 int digits;
611 int base;
612 *outp++ = '&';
613 *outp++ = '#';
614 if (*p<10) {
615 digits = 1;
616 base = 1;
617 }
618 else if (*p<100) {
619 digits = 2;
620 base = 10;
621 }
622 else if (*p<1000) {
623 digits = 3;
624 base = 100;
625 }
626 else if (*p<10000) {
627 digits = 4;
628 base = 1000;
629 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000630#ifndef Py_UNICODE_WIDE
631 else {
632 digits = 5;
633 base = 10000;
634 }
635#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000636 else if (*p<100000) {
637 digits = 5;
638 base = 10000;
639 }
640 else if (*p<1000000) {
641 digits = 6;
642 base = 100000;
643 }
644 else {
645 digits = 7;
646 base = 1000000;
647 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000648#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000649 while (digits-->0) {
650 *outp++ = '0' + c/base;
651 c %= base;
652 base /= 10;
653 }
654 *outp++ = ';';
655 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000656 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000657 Py_DECREF(res);
658 Py_DECREF(object);
659 return restuple;
660 }
661 else {
662 wrong_exception_type(exc);
663 return NULL;
664 }
665}
666
667static Py_UNICODE hexdigits[] = {
668 '0', '1', '2', '3', '4', '5', '6', '7',
669 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
670};
671
672PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
673{
674 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
675 PyObject *restuple;
676 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000677 Py_ssize_t start;
678 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000679 PyObject *res;
680 Py_UNICODE *p;
681 Py_UNICODE *startp;
682 Py_UNICODE *outp;
683 int ressize;
684 if (PyUnicodeEncodeError_GetStart(exc, &start))
685 return NULL;
686 if (PyUnicodeEncodeError_GetEnd(exc, &end))
687 return NULL;
688 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
689 return NULL;
690 startp = PyUnicode_AS_UNICODE(object);
691 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000692#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 if (*p >= 0x00010000)
694 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000695 else
696#endif
697 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000698 ressize += 1+1+4;
699 }
700 else
701 ressize += 1+1+2;
702 }
703 res = PyUnicode_FromUnicode(NULL, ressize);
704 if (res==NULL)
705 return NULL;
706 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
707 p < startp+end; ++p) {
708 Py_UNICODE c = *p;
709 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000710#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000711 if (c >= 0x00010000) {
712 *outp++ = 'U';
713 *outp++ = hexdigits[(c>>28)&0xf];
714 *outp++ = hexdigits[(c>>24)&0xf];
715 *outp++ = hexdigits[(c>>20)&0xf];
716 *outp++ = hexdigits[(c>>16)&0xf];
717 *outp++ = hexdigits[(c>>12)&0xf];
718 *outp++ = hexdigits[(c>>8)&0xf];
719 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000720 else
721#endif
722 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000723 *outp++ = 'u';
724 *outp++ = hexdigits[(c>>12)&0xf];
725 *outp++ = hexdigits[(c>>8)&0xf];
726 }
727 else
728 *outp++ = 'x';
729 *outp++ = hexdigits[(c>>4)&0xf];
730 *outp++ = hexdigits[c&0xf];
731 }
732
Martin v. Löwis18e16552006-02-15 17:27:45 +0000733 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000734 Py_DECREF(res);
735 Py_DECREF(object);
736 return restuple;
737 }
738 else {
739 wrong_exception_type(exc);
740 return NULL;
741 }
742}
743
744static PyObject *strict_errors(PyObject *self, PyObject *exc)
745{
746 return PyCodec_StrictErrors(exc);
747}
748
749
750static PyObject *ignore_errors(PyObject *self, PyObject *exc)
751{
752 return PyCodec_IgnoreErrors(exc);
753}
754
755
756static PyObject *replace_errors(PyObject *self, PyObject *exc)
757{
758 return PyCodec_ReplaceErrors(exc);
759}
760
761
762static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
763{
764 return PyCodec_XMLCharRefReplaceErrors(exc);
765}
766
767
768static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
769{
770 return PyCodec_BackslashReplaceErrors(exc);
771}
772
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000773static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000774{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000775 static struct {
776 char *name;
777 PyMethodDef def;
778 } methods[] =
779 {
780 {
781 "strict",
782 {
783 "strict_errors",
784 strict_errors,
785 METH_O
786 }
787 },
788 {
789 "ignore",
790 {
791 "ignore_errors",
792 ignore_errors,
793 METH_O
794 }
795 },
796 {
797 "replace",
798 {
799 "replace_errors",
800 replace_errors,
801 METH_O
802 }
803 },
804 {
805 "xmlcharrefreplace",
806 {
807 "xmlcharrefreplace_errors",
808 xmlcharrefreplace_errors,
809 METH_O
810 }
811 },
812 {
813 "backslashreplace",
814 {
815 "backslashreplace_errors",
816 backslashreplace_errors,
817 METH_O
818 }
819 }
820 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000821
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000822 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000823 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000824 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000825
826 if (interp->codec_search_path != NULL)
827 return 0;
828
829 interp->codec_search_path = PyList_New(0);
830 interp->codec_search_cache = PyDict_New();
831 interp->codec_error_registry = PyDict_New();
832
833 if (interp->codec_error_registry) {
834 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
835 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
836 int res;
837 if (!func)
838 Py_FatalError("can't initialize codec error registry");
839 res = PyCodec_RegisterError(methods[i].name, func);
840 Py_DECREF(func);
841 if (res)
842 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000843 }
844 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000845
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000846 if (interp->codec_search_path == NULL ||
847 interp->codec_search_cache == NULL ||
848 interp->codec_error_registry == NULL)
849 Py_FatalError("can't initialize codec registry");
850
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000851 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000852 if (mod == NULL) {
853 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
854 /* Ignore ImportErrors... this is done so that
855 distributions can disable the encodings package. Note
856 that other errors are not masked, e.g. SystemErrors
857 raised to inform the user of an error in the Python
858 configuration are still reported back to the user. */
859 PyErr_Clear();
860 return 0;
861 }
862 return -1;
863 }
864 Py_DECREF(mod);
865 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000866}