blob: 633a24c6611ea2ba225535c86a8817f34576221f [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000017 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000058
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
Guido van Rossum21431e82007-10-19 21:48:41 +000063
64 p = PyMem_Malloc(len + 1);
65 if (p == NULL)
66 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000067 for (i = 0; i < len; i++) {
68 register char ch = string[i];
69 if (ch == ' ')
70 ch = '-';
71 else
Thomas Wouters477c8d52006-05-27 19:21:47 +000072 ch = tolower(Py_CHARMASK(ch));
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 p[i] = ch;
74 }
Guido van Rossum21431e82007-10-19 21:48:41 +000075 p[i] = '\0';
76 v = PyUnicode_FromString(p);
77 if (v == NULL)
78 return NULL;
79 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000080 return v;
81}
82
83/* Lookup the given encoding and return a tuple providing the codec
84 facilities.
85
86 The encoding string is looked up converted to all lower-case
87 characters. This makes encodings looked up through this mechanism
88 effectively case-insensitive.
89
Guido van Rossum98297ee2007-11-06 21:34:58 +000090 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000091
92 As side effect, this tries to load the encodings package, if not
93 yet done. This is part of the lazy load strategy for the encodings
94 package.
95
96*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000097
98PyObject *_PyCodec_Lookup(const char *encoding)
99{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000100 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000101 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000102 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000103
Fred Drake766de832000-05-09 19:55:59 +0000104 if (encoding == NULL) {
105 PyErr_BadArgument();
106 goto onError;
107 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000108
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000109 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000111 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000112
Guido van Rossum9e896b32000-04-05 20:11:21 +0000113 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000114 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 replaced with underscores. */
116 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000117 if (v == NULL)
118 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000119 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120
121 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000122 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123 if (result != NULL) {
124 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000125 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 return result;
127 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000128
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 args = PyTuple_New(1);
131 if (args == NULL)
132 goto onError;
133 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000134
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000135 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136 if (len < 0)
137 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000138 if (len == 0) {
139 PyErr_SetString(PyExc_LookupError,
140 "no codec search functions registered: "
141 "can't find encoding");
142 goto onError;
143 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144
145 for (i = 0; i < len; i++) {
146 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000147
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000148 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000149 if (func == NULL)
150 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000151 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000152 if (result == NULL)
153 goto onError;
154 if (result == Py_None) {
155 Py_DECREF(result);
156 continue;
157 }
158 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159 PyErr_SetString(PyExc_TypeError,
160 "codec search functions must return 4-tuples");
161 Py_DECREF(result);
162 goto onError;
163 }
164 break;
165 }
166 if (i == len) {
167 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000168 PyErr_Format(PyExc_LookupError,
169 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 goto onError;
171 }
172
173 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000174 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
175 Py_DECREF(result);
176 goto onError;
177 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 Py_DECREF(args);
179 return result;
180
181 onError:
182 Py_XDECREF(args);
183 return NULL;
184}
185
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000186/* Codec registry encoding check API. */
187
188int PyCodec_KnownEncoding(const char *encoding)
189{
190 PyObject *codecs;
191
192 codecs = _PyCodec_Lookup(encoding);
193 if (!codecs) {
194 PyErr_Clear();
195 return 0;
196 }
197 else {
198 Py_DECREF(codecs);
199 return 1;
200 }
201}
202
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000203static
204PyObject *args_tuple(PyObject *object,
205 const char *errors)
206{
207 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000208
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000209 args = PyTuple_New(1 + (errors != NULL));
210 if (args == NULL)
211 return NULL;
212 Py_INCREF(object);
213 PyTuple_SET_ITEM(args,0,object);
214 if (errors) {
215 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000216
Guido van Rossum21431e82007-10-19 21:48:41 +0000217 v = PyUnicode_FromString(errors);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000218 if (v == NULL) {
219 Py_DECREF(args);
220 return NULL;
221 }
222 PyTuple_SET_ITEM(args, 1, v);
223 }
224 return args;
225}
226
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000227/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000228
229static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000230PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000231{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232 PyObject *codecs;
233 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000234
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000235 codecs = _PyCodec_Lookup(encoding);
236 if (codecs == NULL)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000237 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000238 v = PyTuple_GET_ITEM(codecs, index);
239 Py_DECREF(codecs);
240 Py_INCREF(v);
241 return v;
242}
243
244/* Helper function to create an incremental codec. */
245
246static
247PyObject *codec_getincrementalcodec(const char *encoding,
248 const char *errors,
249 const char *attrname)
250{
251 PyObject *codecs, *ret, *inccodec;
252
253 codecs = _PyCodec_Lookup(encoding);
254 if (codecs == NULL)
255 return NULL;
256 inccodec = PyObject_GetAttrString(codecs, attrname);
257 Py_DECREF(codecs);
258 if (inccodec == NULL)
259 return NULL;
260 if (errors)
261 ret = PyObject_CallFunction(inccodec, "s", errors);
262 else
263 ret = PyObject_CallFunction(inccodec, NULL);
264 Py_DECREF(inccodec);
265 return ret;
266}
267
268/* Helper function to create a stream codec. */
269
270static
271PyObject *codec_getstreamcodec(const char *encoding,
272 PyObject *stream,
273 const char *errors,
274 const int index)
275{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000276 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277
278 codecs = _PyCodec_Lookup(encoding);
279 if (codecs == NULL)
280 return NULL;
281
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000282 codeccls = PyTuple_GET_ITEM(codecs, index);
283 if (errors != NULL)
284 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
285 else
286 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 Py_DECREF(codecs);
288 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000289}
290
Guido van Rossum98297ee2007-11-06 21:34:58 +0000291/* Convenience APIs to query the Codec registry.
292
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000293 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 */
296
297PyObject *PyCodec_Encoder(const char *encoding)
298{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000300}
301
302PyObject *PyCodec_Decoder(const char *encoding)
303{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000304 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000305}
306
Thomas Woutersa9773292006-04-21 09:43:23 +0000307PyObject *PyCodec_IncrementalEncoder(const char *encoding,
308 const char *errors)
309{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000310 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000311}
312
313PyObject *PyCodec_IncrementalDecoder(const char *encoding,
314 const char *errors)
315{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000316 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000317}
318
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000319PyObject *PyCodec_StreamReader(const char *encoding,
320 PyObject *stream,
321 const char *errors)
322{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000323 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324}
325
326PyObject *PyCodec_StreamWriter(const char *encoding,
327 PyObject *stream,
328 const char *errors)
329{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000330 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331}
332
333/* Encode an object (e.g. an Unicode object) using the given encoding
334 and return the resulting encoded object (usually a Python string).
335
336 errors is passed to the encoder factory as argument if non-NULL. */
337
338PyObject *PyCodec_Encode(PyObject *object,
339 const char *encoding,
340 const char *errors)
341{
342 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000343 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000344 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000345
346 encoder = PyCodec_Encoder(encoding);
347 if (encoder == NULL)
348 goto onError;
349
350 args = args_tuple(object, errors);
351 if (args == NULL)
352 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000353
354 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000355 if (result == NULL)
356 goto onError;
357
Guido van Rossum98297ee2007-11-06 21:34:58 +0000358 if (!PyTuple_Check(result) ||
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359 PyTuple_GET_SIZE(result) != 2) {
360 PyErr_SetString(PyExc_TypeError,
Guido van Rossum98297ee2007-11-06 21:34:58 +0000361 "encoder must return a tuple (object, integer)");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000362 goto onError;
363 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000364 v = PyTuple_GET_ITEM(result,0);
365 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000366 /* We don't check or use the second (integer) entry. */
367
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000368 Py_DECREF(args);
369 Py_DECREF(encoder);
370 Py_DECREF(result);
371 return v;
372
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000374 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375 Py_XDECREF(args);
376 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000377 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378}
379
380/* Decode an object (usually a Python string) using the given encoding
381 and return an equivalent object (e.g. an Unicode object).
382
383 errors is passed to the decoder factory as argument if non-NULL. */
384
385PyObject *PyCodec_Decode(PyObject *object,
386 const char *encoding,
387 const char *errors)
388{
389 PyObject *decoder = NULL;
390 PyObject *args = NULL, *result = NULL;
391 PyObject *v;
392
393 decoder = PyCodec_Decoder(encoding);
394 if (decoder == NULL)
395 goto onError;
396
397 args = args_tuple(object, errors);
398 if (args == NULL)
399 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000400
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000401 result = PyEval_CallObject(decoder,args);
402 if (result == NULL)
403 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000404 if (!PyTuple_Check(result) ||
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000405 PyTuple_GET_SIZE(result) != 2) {
406 PyErr_SetString(PyExc_TypeError,
407 "decoder must return a tuple (object,integer)");
408 goto onError;
409 }
410 v = PyTuple_GET_ITEM(result,0);
411 Py_INCREF(v);
412 /* We don't check or use the second (integer) entry. */
413
414 Py_DECREF(args);
415 Py_DECREF(decoder);
416 Py_DECREF(result);
417 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000418
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419 onError:
420 Py_XDECREF(args);
421 Py_XDECREF(decoder);
422 Py_XDECREF(result);
423 return NULL;
424}
425
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000426/* Register the error handling callback function error under the name
427 name. This function will be called by the codec when it encounters
428 an unencodable characters/undecodable bytes and doesn't know the
429 callback name, when name is specified as the error parameter
430 in the call to the encode/decode function.
431 Return 0 on success, -1 on error */
432int PyCodec_RegisterError(const char *name, PyObject *error)
433{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000434 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000435 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000437 if (!PyCallable_Check(error)) {
438 PyErr_SetString(PyExc_TypeError, "handler must be callable");
439 return -1;
440 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000441 return PyDict_SetItemString(interp->codec_error_registry,
442 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000443}
444
445/* Lookup the error handling callback function registered under the
446 name error. As a special case NULL can be passed, in which case
447 the error handling callback for strict encoding will be returned. */
448PyObject *PyCodec_LookupError(const char *name)
449{
450 PyObject *handler = NULL;
451
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000452 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000453 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
454 return NULL;
455
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 if (name==NULL)
457 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000458 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000459 if (!handler)
460 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
461 else
462 Py_INCREF(handler);
463 return handler;
464}
465
466static void wrong_exception_type(PyObject *exc)
467{
468 PyObject *type = PyObject_GetAttrString(exc, "__class__");
469 if (type != NULL) {
Walter Dörwald573c08c2007-05-25 15:46:59 +0000470 PyObject *name = PyObject_GetAttrString(type, "__name__");
471 Py_DECREF(type);
472 if (name != NULL) {
473 PyErr_Format(PyExc_TypeError,
474 "don't know how to handle %S in error callback", name);
475 Py_DECREF(name);
476 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000477 }
478}
479
480PyObject *PyCodec_StrictErrors(PyObject *exc)
481{
Brett Cannonbf364092006-03-01 04:25:17 +0000482 if (PyExceptionInstance_Check(exc))
483 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000484 else
485 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
486 return NULL;
487}
488
489
490PyObject *PyCodec_IgnoreErrors(PyObject *exc)
491{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000492 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000493 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
494 if (PyUnicodeEncodeError_GetEnd(exc, &end))
495 return NULL;
496 }
497 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
498 if (PyUnicodeDecodeError_GetEnd(exc, &end))
499 return NULL;
500 }
501 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
502 if (PyUnicodeTranslateError_GetEnd(exc, &end))
503 return NULL;
504 }
505 else {
506 wrong_exception_type(exc);
507 return NULL;
508 }
509 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000510 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000511}
512
513
514PyObject *PyCodec_ReplaceErrors(PyObject *exc)
515{
516 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000517 Py_ssize_t start;
518 Py_ssize_t end;
519 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000520
521 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
522 PyObject *res;
523 Py_UNICODE *p;
524 if (PyUnicodeEncodeError_GetStart(exc, &start))
525 return NULL;
526 if (PyUnicodeEncodeError_GetEnd(exc, &end))
527 return NULL;
528 res = PyUnicode_FromUnicode(NULL, end-start);
529 if (res == NULL)
530 return NULL;
531 for (p = PyUnicode_AS_UNICODE(res), i = start;
532 i<end; ++p, ++i)
533 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000534 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000535 Py_DECREF(res);
536 return restuple;
537 }
538 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
539 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
540 if (PyUnicodeDecodeError_GetEnd(exc, &end))
541 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000542 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000543 }
544 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
545 PyObject *res;
546 Py_UNICODE *p;
547 if (PyUnicodeTranslateError_GetStart(exc, &start))
548 return NULL;
549 if (PyUnicodeTranslateError_GetEnd(exc, &end))
550 return NULL;
551 res = PyUnicode_FromUnicode(NULL, end-start);
552 if (res == NULL)
553 return NULL;
554 for (p = PyUnicode_AS_UNICODE(res), i = start;
555 i<end; ++p, ++i)
556 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000557 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000558 Py_DECREF(res);
559 return restuple;
560 }
561 else {
562 wrong_exception_type(exc);
563 return NULL;
564 }
565}
566
567PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
568{
569 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
570 PyObject *restuple;
571 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000572 Py_ssize_t start;
573 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000574 PyObject *res;
575 Py_UNICODE *p;
576 Py_UNICODE *startp;
577 Py_UNICODE *outp;
578 int ressize;
579 if (PyUnicodeEncodeError_GetStart(exc, &start))
580 return NULL;
581 if (PyUnicodeEncodeError_GetEnd(exc, &end))
582 return NULL;
583 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
584 return NULL;
585 startp = PyUnicode_AS_UNICODE(object);
586 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
587 if (*p<10)
588 ressize += 2+1+1;
589 else if (*p<100)
590 ressize += 2+2+1;
591 else if (*p<1000)
592 ressize += 2+3+1;
593 else if (*p<10000)
594 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000595#ifndef Py_UNICODE_WIDE
596 else
597 ressize += 2+5+1;
598#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000599 else if (*p<100000)
600 ressize += 2+5+1;
601 else if (*p<1000000)
602 ressize += 2+6+1;
603 else
604 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000605#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000606 }
607 /* allocate replacement */
608 res = PyUnicode_FromUnicode(NULL, ressize);
609 if (res == NULL) {
610 Py_DECREF(object);
611 return NULL;
612 }
613 /* generate replacement */
614 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
615 p < startp+end; ++p) {
616 Py_UNICODE c = *p;
617 int digits;
618 int base;
619 *outp++ = '&';
620 *outp++ = '#';
621 if (*p<10) {
622 digits = 1;
623 base = 1;
624 }
625 else if (*p<100) {
626 digits = 2;
627 base = 10;
628 }
629 else if (*p<1000) {
630 digits = 3;
631 base = 100;
632 }
633 else if (*p<10000) {
634 digits = 4;
635 base = 1000;
636 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000637#ifndef Py_UNICODE_WIDE
638 else {
639 digits = 5;
640 base = 10000;
641 }
642#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000643 else if (*p<100000) {
644 digits = 5;
645 base = 10000;
646 }
647 else if (*p<1000000) {
648 digits = 6;
649 base = 100000;
650 }
651 else {
652 digits = 7;
653 base = 1000000;
654 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000655#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 while (digits-->0) {
657 *outp++ = '0' + c/base;
658 c %= base;
659 base /= 10;
660 }
661 *outp++ = ';';
662 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000663 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000664 Py_DECREF(res);
665 Py_DECREF(object);
666 return restuple;
667 }
668 else {
669 wrong_exception_type(exc);
670 return NULL;
671 }
672}
673
674static Py_UNICODE hexdigits[] = {
675 '0', '1', '2', '3', '4', '5', '6', '7',
676 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
677};
678
679PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
680{
681 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
682 PyObject *restuple;
683 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000684 Py_ssize_t start;
685 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000686 PyObject *res;
687 Py_UNICODE *p;
688 Py_UNICODE *startp;
689 Py_UNICODE *outp;
690 int ressize;
691 if (PyUnicodeEncodeError_GetStart(exc, &start))
692 return NULL;
693 if (PyUnicodeEncodeError_GetEnd(exc, &end))
694 return NULL;
695 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
696 return NULL;
697 startp = PyUnicode_AS_UNICODE(object);
698 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000699#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000700 if (*p >= 0x00010000)
701 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000702 else
703#endif
704 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000705 ressize += 1+1+4;
706 }
707 else
708 ressize += 1+1+2;
709 }
710 res = PyUnicode_FromUnicode(NULL, ressize);
711 if (res==NULL)
712 return NULL;
713 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
714 p < startp+end; ++p) {
715 Py_UNICODE c = *p;
716 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000717#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000718 if (c >= 0x00010000) {
719 *outp++ = 'U';
720 *outp++ = hexdigits[(c>>28)&0xf];
721 *outp++ = hexdigits[(c>>24)&0xf];
722 *outp++ = hexdigits[(c>>20)&0xf];
723 *outp++ = hexdigits[(c>>16)&0xf];
724 *outp++ = hexdigits[(c>>12)&0xf];
725 *outp++ = hexdigits[(c>>8)&0xf];
726 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000727 else
728#endif
729 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000730 *outp++ = 'u';
731 *outp++ = hexdigits[(c>>12)&0xf];
732 *outp++ = hexdigits[(c>>8)&0xf];
733 }
734 else
735 *outp++ = 'x';
736 *outp++ = hexdigits[(c>>4)&0xf];
737 *outp++ = hexdigits[c&0xf];
738 }
739
Martin v. Löwis18e16552006-02-15 17:27:45 +0000740 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000741 Py_DECREF(res);
742 Py_DECREF(object);
743 return restuple;
744 }
745 else {
746 wrong_exception_type(exc);
747 return NULL;
748 }
749}
750
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000751/* This handler is declared static until someone demonstrates
752 a need to call it directly. */
753static PyObject *
754PyCodec_SurrogateErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000755{
756 PyObject *restuple;
757 PyObject *object;
758 Py_ssize_t start;
759 Py_ssize_t end;
760 PyObject *res;
761 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
762 Py_UNICODE *p;
763 Py_UNICODE *startp;
764 char *outp;
765 if (PyUnicodeEncodeError_GetStart(exc, &start))
766 return NULL;
767 if (PyUnicodeEncodeError_GetEnd(exc, &end))
768 return NULL;
769 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
770 return NULL;
771 startp = PyUnicode_AS_UNICODE(object);
772 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
773 if (!res) {
774 Py_DECREF(object);
775 return NULL;
776 }
777 outp = PyBytes_AsString(res);
778 for (p = startp+start; p < startp+end; p++) {
779 Py_UNICODE ch = *p;
780 if (ch < 0xd800 || ch > 0xdfff) {
781 /* Not a surrogate, fail with original exception */
782 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
783 Py_DECREF(res);
784 Py_DECREF(object);
785 return NULL;
786 }
787 *outp++ = (char)(0xe0 | (ch >> 12));
788 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
789 *outp++ = (char)(0x80 | (ch & 0x3f));
790 }
791 restuple = Py_BuildValue("(On)", res, end);
792 Py_DECREF(res);
793 Py_DECREF(object);
794 return restuple;
795 }
796 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
797 unsigned char *p;
798 Py_UNICODE ch = 0;
799 if (PyUnicodeDecodeError_GetStart(exc, &start))
800 return NULL;
801 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
802 return NULL;
803 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
804 Py_DECREF(object);
805 return NULL;
806 }
807 /* Try decoding a single surrogate character. If
808 there are more, let the codec call us again. */
809 p += start;
810 if ((p[0] & 0xf0) == 0xe0 ||
811 (p[1] & 0xc0) == 0x80 ||
812 (p[2] & 0xc0) == 0x80) {
813 /* it's a three-byte code */
814 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
815 if (ch < 0xd800 || ch > 0xdfff)
816 /* it's not a surrogate - fail */
817 ch = 0;
818 }
819 Py_DECREF(object);
820 if (ch == 0) {
821 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
822 return NULL;
823 }
824 return Py_BuildValue("(u#n)", &ch, 1, start+3);
825 }
826 else {
827 wrong_exception_type(exc);
828 return NULL;
829 }
830}
831
832
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000833static PyObject *strict_errors(PyObject *self, PyObject *exc)
834{
835 return PyCodec_StrictErrors(exc);
836}
837
838
839static PyObject *ignore_errors(PyObject *self, PyObject *exc)
840{
841 return PyCodec_IgnoreErrors(exc);
842}
843
844
845static PyObject *replace_errors(PyObject *self, PyObject *exc)
846{
847 return PyCodec_ReplaceErrors(exc);
848}
849
850
851static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
852{
853 return PyCodec_XMLCharRefReplaceErrors(exc);
854}
855
856
857static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
858{
859 return PyCodec_BackslashReplaceErrors(exc);
860}
861
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000862static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
863{
864 return PyCodec_SurrogateErrors(exc);
865}
866
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000867static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000868{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000869 static struct {
870 char *name;
871 PyMethodDef def;
872 } methods[] =
873 {
874 {
875 "strict",
876 {
877 "strict_errors",
878 strict_errors,
879 METH_O
880 }
881 },
882 {
883 "ignore",
884 {
885 "ignore_errors",
886 ignore_errors,
887 METH_O
888 }
889 },
890 {
891 "replace",
892 {
893 "replace_errors",
894 replace_errors,
895 METH_O
896 }
897 },
898 {
899 "xmlcharrefreplace",
900 {
901 "xmlcharrefreplace_errors",
902 xmlcharrefreplace_errors,
903 METH_O
904 }
905 },
906 {
907 "backslashreplace",
908 {
909 "backslashreplace_errors",
910 backslashreplace_errors,
911 METH_O
912 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000913 },
914 {
915 "surrogates",
916 {
917 "surrogates",
918 surrogates_errors,
919 METH_O
920 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000921 }
922 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000923
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000924 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000925 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000926 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000927
928 if (interp->codec_search_path != NULL)
929 return 0;
930
931 interp->codec_search_path = PyList_New(0);
932 interp->codec_search_cache = PyDict_New();
933 interp->codec_error_registry = PyDict_New();
934
935 if (interp->codec_error_registry) {
936 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
937 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
938 int res;
939 if (!func)
940 Py_FatalError("can't initialize codec error registry");
941 res = PyCodec_RegisterError(methods[i].name, func);
942 Py_DECREF(func);
943 if (res)
944 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000945 }
946 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000947
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000948 if (interp->codec_search_path == NULL ||
949 interp->codec_search_cache == NULL ||
950 interp->codec_error_registry == NULL)
951 Py_FatalError("can't initialize codec registry");
952
Christian Heimes819b8bf2008-01-03 23:05:47 +0000953 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000954 if (mod == NULL) {
955 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
956 /* Ignore ImportErrors... this is done so that
957 distributions can disable the encodings package. Note
958 that other errors are not masked, e.g. SystemErrors
959 raised to inform the user of an error in the Python
960 configuration are still reported back to the user. */
961 PyErr_Clear();
962 return 0;
963 }
964 return -1;
965 }
966 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +0000967 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000968 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000969}