blob: 79dfe89aba2235511cc2c8e09fff809781682dc7 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014/* --- Codec Registry ----------------------------------------------------- */
15
16/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000017 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018
19 This is done in a lazy way so that the Unicode implementation does
20 not downgrade startup time of scripts not needing it.
21
Guido van Rossumb95de4f2000-03-31 17:25:23 +000022 ImportErrors are silently ignored by this function. Only one try is
23 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000024
25*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000033 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 PyErr_BadArgument();
36 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000058
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000059 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000060 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000062 }
Guido van Rossum21431e82007-10-19 21:48:41 +000063
64 p = PyMem_Malloc(len + 1);
65 if (p == NULL)
66 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000067 for (i = 0; i < len; i++) {
68 register char ch = string[i];
69 if (ch == ' ')
70 ch = '-';
71 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020072 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000073 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000074 }
Guido van Rossum21431e82007-10-19 21:48:41 +000075 p[i] = '\0';
76 v = PyUnicode_FromString(p);
77 if (v == NULL)
78 return NULL;
79 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000080 return v;
81}
82
83/* Lookup the given encoding and return a tuple providing the codec
84 facilities.
85
86 The encoding string is looked up converted to all lower-case
87 characters. This makes encodings looked up through this mechanism
88 effectively case-insensitive.
89
Guido van Rossum98297ee2007-11-06 21:34:58 +000090 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000091
92 As side effect, this tries to load the encodings package, if not
93 yet done. This is part of the lazy load strategy for the encodings
94 package.
95
96*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000097
98PyObject *_PyCodec_Lookup(const char *encoding)
99{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000100 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000101 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000102 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000103
Fred Drake766de832000-05-09 19:55:59 +0000104 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000105 PyErr_BadArgument();
106 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000107 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000108
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000109 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000111 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000112
Guido van Rossum9e896b32000-04-05 20:11:21 +0000113 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000114 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 replaced with underscores. */
116 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000117 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000118 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000119 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120
121 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000122 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000124 Py_INCREF(result);
125 Py_DECREF(v);
126 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000128
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 args = PyTuple_New(1);
131 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000132 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000133 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000134
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000135 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000138 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 PyErr_SetString(PyExc_LookupError,
140 "no codec search functions registered: "
141 "can't find encoding");
142 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000143 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144
145 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000147
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 func = PyList_GetItem(interp->codec_search_path, i);
149 if (func == NULL)
150 goto onError;
151 result = PyEval_CallObject(func, args);
152 if (result == NULL)
153 goto onError;
154 if (result == Py_None) {
155 Py_DECREF(result);
156 continue;
157 }
158 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159 PyErr_SetString(PyExc_TypeError,
160 "codec search functions must return 4-tuples");
161 Py_DECREF(result);
162 goto onError;
163 }
164 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000165 }
166 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 /* XXX Perhaps we should cache misses too ? */
168 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000169 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 }
172
173 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000174 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_DECREF(result);
176 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000177 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 Py_DECREF(args);
179 return result;
180
181 onError:
182 Py_XDECREF(args);
183 return NULL;
184}
185
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000186/* Codec registry encoding check API. */
187
188int PyCodec_KnownEncoding(const char *encoding)
189{
190 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000191
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000192 codecs = _PyCodec_Lookup(encoding);
193 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000194 PyErr_Clear();
195 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000196 }
197 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000198 Py_DECREF(codecs);
199 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000200 }
201}
202
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000203static
204PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000205 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000206{
207 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000208
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000209 args = PyTuple_New(1 + (errors != NULL));
210 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000211 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000212 Py_INCREF(object);
213 PyTuple_SET_ITEM(args,0,object);
214 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000216
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 v = PyUnicode_FromString(errors);
218 if (v == NULL) {
219 Py_DECREF(args);
220 return NULL;
221 }
222 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000223 }
224 return args;
225}
226
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000227/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000228
229static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000230PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000231{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232 PyObject *codecs;
233 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000234
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000235 codecs = _PyCodec_Lookup(encoding);
236 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000237 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000238 v = PyTuple_GET_ITEM(codecs, index);
239 Py_DECREF(codecs);
240 Py_INCREF(v);
241 return v;
242}
243
244/* Helper function to create an incremental codec. */
245
246static
247PyObject *codec_getincrementalcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 const char *errors,
249 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000250{
251 PyObject *codecs, *ret, *inccodec;
252
253 codecs = _PyCodec_Lookup(encoding);
254 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 inccodec = PyObject_GetAttrString(codecs, attrname);
257 Py_DECREF(codecs);
258 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000259 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 Py_DECREF(inccodec);
265 return ret;
266}
267
268/* Helper function to create a stream codec. */
269
270static
271PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000272 PyObject *stream,
273 const char *errors,
274 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000276 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277
278 codecs = _PyCodec_Lookup(encoding);
279 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000280 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000282 codeccls = PyTuple_GET_ITEM(codecs, index);
283 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000285 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 Py_DECREF(codecs);
288 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000289}
290
Guido van Rossum98297ee2007-11-06 21:34:58 +0000291/* Convenience APIs to query the Codec registry.
292
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000293 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 */
296
297PyObject *PyCodec_Encoder(const char *encoding)
298{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000300}
301
302PyObject *PyCodec_Decoder(const char *encoding)
303{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000304 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000305}
306
Thomas Woutersa9773292006-04-21 09:43:23 +0000307PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000309{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000310 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000311}
312
313PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000314 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000315{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000316 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000317}
318
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000319PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000320 PyObject *stream,
321 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000322{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000323 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324}
325
326PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000327 PyObject *stream,
328 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000329{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000330 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331}
332
333/* Encode an object (e.g. an Unicode object) using the given encoding
334 and return the resulting encoded object (usually a Python string).
335
336 errors is passed to the encoder factory as argument if non-NULL. */
337
338PyObject *PyCodec_Encode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000339 const char *encoding,
340 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000341{
342 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000343 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000344 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000345
346 encoder = PyCodec_Encoder(encoding);
347 if (encoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000348 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349
350 args = args_tuple(object, errors);
351 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000352 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000353
354 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000355 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000356 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000357
Guido van Rossum98297ee2007-11-06 21:34:58 +0000358 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 PyTuple_GET_SIZE(result) != 2) {
360 PyErr_SetString(PyExc_TypeError,
361 "encoder must return a tuple (object, integer)");
362 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000363 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000364 v = PyTuple_GET_ITEM(result,0);
365 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000366 /* We don't check or use the second (integer) entry. */
367
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000368 Py_DECREF(args);
369 Py_DECREF(encoder);
370 Py_DECREF(result);
371 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000374 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375 Py_XDECREF(args);
376 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000377 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378}
379
380/* Decode an object (usually a Python string) using the given encoding
381 and return an equivalent object (e.g. an Unicode object).
382
383 errors is passed to the decoder factory as argument if non-NULL. */
384
385PyObject *PyCodec_Decode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000386 const char *encoding,
387 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000388{
389 PyObject *decoder = NULL;
390 PyObject *args = NULL, *result = NULL;
391 PyObject *v;
392
393 decoder = PyCodec_Decoder(encoding);
394 if (decoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000395 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000396
397 args = args_tuple(object, errors);
398 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000399 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000400
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000401 result = PyEval_CallObject(decoder,args);
402 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000403 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000404 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000405 PyTuple_GET_SIZE(result) != 2) {
406 PyErr_SetString(PyExc_TypeError,
407 "decoder must return a tuple (object,integer)");
408 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000409 }
410 v = PyTuple_GET_ITEM(result,0);
411 Py_INCREF(v);
412 /* We don't check or use the second (integer) entry. */
413
414 Py_DECREF(args);
415 Py_DECREF(decoder);
416 Py_DECREF(result);
417 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000418
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419 onError:
420 Py_XDECREF(args);
421 Py_XDECREF(decoder);
422 Py_XDECREF(result);
423 return NULL;
424}
425
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000426/* Register the error handling callback function error under the name
427 name. This function will be called by the codec when it encounters
428 an unencodable characters/undecodable bytes and doesn't know the
429 callback name, when name is specified as the error parameter
430 in the call to the encode/decode function.
431 Return 0 on success, -1 on error */
432int PyCodec_RegisterError(const char *name, PyObject *error)
433{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000434 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000435 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000436 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000437 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000438 PyErr_SetString(PyExc_TypeError, "handler must be callable");
439 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000440 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000441 return PyDict_SetItemString(interp->codec_error_registry,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000442 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000443}
444
445/* Lookup the error handling callback function registered under the
446 name error. As a special case NULL can be passed, in which case
447 the error handling callback for strict encoding will be returned. */
448PyObject *PyCodec_LookupError(const char *name)
449{
450 PyObject *handler = NULL;
451
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000452 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000453 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000454 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000455
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000456 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000457 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000458 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000459 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000460 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000461 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000462 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000463 return handler;
464}
465
466static void wrong_exception_type(PyObject *exc)
467{
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200468 _Py_identifier(__class__);
469 _Py_identifier(__name__);
470 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000471 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200472 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000473 Py_DECREF(type);
474 if (name != NULL) {
475 PyErr_Format(PyExc_TypeError,
476 "don't know how to handle %S in error callback", name);
477 Py_DECREF(name);
478 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000479 }
480}
481
482PyObject *PyCodec_StrictErrors(PyObject *exc)
483{
Brett Cannonbf364092006-03-01 04:25:17 +0000484 if (PyExceptionInstance_Check(exc))
485 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000486 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000487 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000488 return NULL;
489}
490
491
492PyObject *PyCodec_IgnoreErrors(PyObject *exc)
493{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000494 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000495 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000496 if (PyUnicodeEncodeError_GetEnd(exc, &end))
497 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000498 }
499 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000500 if (PyUnicodeDecodeError_GetEnd(exc, &end))
501 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000502 }
503 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000504 if (PyUnicodeTranslateError_GetEnd(exc, &end))
505 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000506 }
507 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000508 wrong_exception_type(exc);
509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000510 }
511 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000512 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000513}
514
515
516PyObject *PyCodec_ReplaceErrors(PyObject *exc)
517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200518 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000519
520 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000521 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200522 int kind;
523 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000524 if (PyUnicodeEncodeError_GetStart(exc, &start))
525 return NULL;
526 if (PyUnicodeEncodeError_GetEnd(exc, &end))
527 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200528 len = end - start;
529 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000530 if (res == NULL)
531 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200532 kind = PyUnicode_KIND(res);
533 data = PyUnicode_DATA(res);
534 for (i = 0; i < len; ++i)
535 PyUnicode_WRITE(kind, data, i, '?');
536 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537 }
538 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000539 if (PyUnicodeDecodeError_GetEnd(exc, &end))
540 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200541 return Py_BuildValue("(Cn)",
542 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
543 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000544 }
545 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200547 int kind;
548 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000549 if (PyUnicodeTranslateError_GetStart(exc, &start))
550 return NULL;
551 if (PyUnicodeTranslateError_GetEnd(exc, &end))
552 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 len = end - start;
554 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000555 if (res == NULL)
556 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200557 kind = PyUnicode_KIND(res);
558 data = PyUnicode_DATA(res);
559 for (i=0; i < len; i++)
560 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
561 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000562 }
563 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000564 wrong_exception_type(exc);
565 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000566 }
567}
568
569PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
570{
571 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000572 PyObject *restuple;
573 PyObject *object;
574 Py_ssize_t start;
575 Py_ssize_t end;
576 PyObject *res;
577 Py_UNICODE *p;
578 Py_UNICODE *startp;
579 Py_UNICODE *outp;
580 int ressize;
581 if (PyUnicodeEncodeError_GetStart(exc, &start))
582 return NULL;
583 if (PyUnicodeEncodeError_GetEnd(exc, &end))
584 return NULL;
585 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
586 return NULL;
587 startp = PyUnicode_AS_UNICODE(object);
588 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
589 if (*p<10)
590 ressize += 2+1+1;
591 else if (*p<100)
592 ressize += 2+2+1;
593 else if (*p<1000)
594 ressize += 2+3+1;
595 else if (*p<10000)
596 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000597#ifndef Py_UNICODE_WIDE
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 else
599 ressize += 2+5+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000600#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000601 else if (*p<100000)
602 ressize += 2+5+1;
603 else if (*p<1000000)
604 ressize += 2+6+1;
605 else
606 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000607#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 }
609 /* allocate replacement */
610 res = PyUnicode_FromUnicode(NULL, ressize);
611 if (res == NULL) {
612 Py_DECREF(object);
613 return NULL;
614 }
615 /* generate replacement */
616 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
617 p < startp+end; ++p) {
618 Py_UNICODE c = *p;
619 int digits;
620 int base;
621 *outp++ = '&';
622 *outp++ = '#';
623 if (*p<10) {
624 digits = 1;
625 base = 1;
626 }
627 else if (*p<100) {
628 digits = 2;
629 base = 10;
630 }
631 else if (*p<1000) {
632 digits = 3;
633 base = 100;
634 }
635 else if (*p<10000) {
636 digits = 4;
637 base = 1000;
638 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000639#ifndef Py_UNICODE_WIDE
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640 else {
641 digits = 5;
642 base = 10000;
643 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000644#else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 else if (*p<100000) {
646 digits = 5;
647 base = 10000;
648 }
649 else if (*p<1000000) {
650 digits = 6;
651 base = 100000;
652 }
653 else {
654 digits = 7;
655 base = 1000000;
656 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000657#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 while (digits-->0) {
659 *outp++ = '0' + c/base;
660 c %= base;
661 base /= 10;
662 }
663 *outp++ = ';';
664 }
665 restuple = Py_BuildValue("(On)", res, end);
666 Py_DECREF(res);
667 Py_DECREF(object);
668 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000669 }
670 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000671 wrong_exception_type(exc);
672 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000673 }
674}
675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676static const char *hexdigits = "0123456789abcdef";
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000677
678PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
679{
Antoine Pitroue4a18922010-09-09 20:30:23 +0000680#ifndef Py_UNICODE_WIDE
681#define IS_SURROGATE_PAIR(p, end) \
682 (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
683 *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
684#else
685#define IS_SURROGATE_PAIR(p, end) 0
686#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000687 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000688 PyObject *restuple;
689 PyObject *object;
690 Py_ssize_t start;
691 Py_ssize_t end;
692 PyObject *res;
693 Py_UNICODE *p;
694 Py_UNICODE *startp;
695 Py_UNICODE *outp;
696 int ressize;
697 if (PyUnicodeEncodeError_GetStart(exc, &start))
698 return NULL;
699 if (PyUnicodeEncodeError_GetEnd(exc, &end))
700 return NULL;
701 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
702 return NULL;
703 startp = PyUnicode_AS_UNICODE(object);
704 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000705#ifdef Py_UNICODE_WIDE
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 if (*p >= 0x00010000)
707 ressize += 1+1+8;
708 else
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000709#endif
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 if (*p >= 0x100) {
Antoine Pitroue4a18922010-09-09 20:30:23 +0000711 if (IS_SURROGATE_PAIR(p, startp+end)) {
712 ressize += 1+1+8;
713 ++p;
714 }
715 else
716 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000717 }
718 else
719 ressize += 1+1+2;
720 }
721 res = PyUnicode_FromUnicode(NULL, ressize);
722 if (res==NULL)
723 return NULL;
724 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
725 p < startp+end; ++p) {
Antoine Pitroue4a18922010-09-09 20:30:23 +0000726 Py_UCS4 c = (Py_UCS4) *p;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 *outp++ = '\\';
Antoine Pitroue4a18922010-09-09 20:30:23 +0000728 if (IS_SURROGATE_PAIR(p, startp+end)) {
729 c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
730 ++p;
731 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (c >= 0x00010000) {
733 *outp++ = 'U';
734 *outp++ = hexdigits[(c>>28)&0xf];
735 *outp++ = hexdigits[(c>>24)&0xf];
736 *outp++ = hexdigits[(c>>20)&0xf];
737 *outp++ = hexdigits[(c>>16)&0xf];
738 *outp++ = hexdigits[(c>>12)&0xf];
739 *outp++ = hexdigits[(c>>8)&0xf];
740 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000741 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 *outp++ = 'u';
743 *outp++ = hexdigits[(c>>12)&0xf];
744 *outp++ = hexdigits[(c>>8)&0xf];
745 }
746 else
747 *outp++ = 'x';
748 *outp++ = hexdigits[(c>>4)&0xf];
749 *outp++ = hexdigits[c&0xf];
750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000751
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 restuple = Py_BuildValue("(On)", res, end);
753 Py_DECREF(res);
754 Py_DECREF(object);
755 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756 }
757 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758 wrong_exception_type(exc);
759 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000760 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000761#undef IS_SURROGATE_PAIR
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000762}
763
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000764/* This handler is declared static until someone demonstrates
765 a need to call it directly. */
766static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000767PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000768{
769 PyObject *restuple;
770 PyObject *object;
771 Py_ssize_t start;
772 Py_ssize_t end;
773 PyObject *res;
774 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 Py_UNICODE *p;
776 Py_UNICODE *startp;
777 char *outp;
778 if (PyUnicodeEncodeError_GetStart(exc, &start))
779 return NULL;
780 if (PyUnicodeEncodeError_GetEnd(exc, &end))
781 return NULL;
782 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
783 return NULL;
784 startp = PyUnicode_AS_UNICODE(object);
785 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
786 if (!res) {
787 Py_DECREF(object);
788 return NULL;
789 }
790 outp = PyBytes_AsString(res);
791 for (p = startp+start; p < startp+end; p++) {
792 Py_UNICODE ch = *p;
793 if (ch < 0xd800 || ch > 0xdfff) {
794 /* Not a surrogate, fail with original exception */
795 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
796 Py_DECREF(res);
797 Py_DECREF(object);
798 return NULL;
799 }
800 *outp++ = (char)(0xe0 | (ch >> 12));
801 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
802 *outp++ = (char)(0x80 | (ch & 0x3f));
803 }
804 restuple = Py_BuildValue("(On)", res, end);
805 Py_DECREF(res);
806 Py_DECREF(object);
807 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000808 }
809 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 unsigned char *p;
811 Py_UNICODE ch = 0;
812 if (PyUnicodeDecodeError_GetStart(exc, &start))
813 return NULL;
814 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
815 return NULL;
816 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
817 Py_DECREF(object);
818 return NULL;
819 }
820 /* Try decoding a single surrogate character. If
821 there are more, let the codec call us again. */
822 p += start;
823 if ((p[0] & 0xf0) == 0xe0 ||
824 (p[1] & 0xc0) == 0x80 ||
825 (p[2] & 0xc0) == 0x80) {
826 /* it's a three-byte code */
827 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
828 if (ch < 0xd800 || ch > 0xdfff)
829 /* it's not a surrogate - fail */
830 ch = 0;
831 }
832 Py_DECREF(object);
833 if (ch == 0) {
834 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
835 return NULL;
836 }
837 return Py_BuildValue("(u#n)", &ch, 1, start+3);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000838 }
839 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000840 wrong_exception_type(exc);
841 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000842 }
843}
844
Martin v. Löwis011e8422009-05-05 04:43:17 +0000845static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000846PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000847{
848 PyObject *restuple;
849 PyObject *object;
850 Py_ssize_t start;
851 Py_ssize_t end;
852 PyObject *res;
853 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 Py_UNICODE *p;
855 Py_UNICODE *startp;
856 char *outp;
857 if (PyUnicodeEncodeError_GetStart(exc, &start))
858 return NULL;
859 if (PyUnicodeEncodeError_GetEnd(exc, &end))
860 return NULL;
861 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
862 return NULL;
863 startp = PyUnicode_AS_UNICODE(object);
864 res = PyBytes_FromStringAndSize(NULL, end-start);
865 if (!res) {
866 Py_DECREF(object);
867 return NULL;
868 }
869 outp = PyBytes_AsString(res);
870 for (p = startp+start; p < startp+end; p++) {
871 Py_UNICODE ch = *p;
872 if (ch < 0xdc80 || ch > 0xdcff) {
873 /* Not a UTF-8b surrogate, fail with original exception */
874 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
875 Py_DECREF(res);
876 Py_DECREF(object);
877 return NULL;
878 }
879 *outp++ = ch - 0xdc00;
880 }
881 restuple = Py_BuildValue("(On)", res, end);
882 Py_DECREF(res);
883 Py_DECREF(object);
884 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000885 }
886 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000887 unsigned char *p;
888 Py_UNICODE ch[4]; /* decode up to 4 bad bytes. */
889 int consumed = 0;
890 if (PyUnicodeDecodeError_GetStart(exc, &start))
891 return NULL;
892 if (PyUnicodeDecodeError_GetEnd(exc, &end))
893 return NULL;
894 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
895 return NULL;
896 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
897 Py_DECREF(object);
898 return NULL;
899 }
900 while (consumed < 4 && consumed < end-start) {
901 /* Refuse to escape ASCII bytes. */
902 if (p[start+consumed] < 128)
903 break;
904 ch[consumed] = 0xdc00 + p[start+consumed];
905 consumed++;
906 }
907 Py_DECREF(object);
908 if (!consumed) {
909 /* codec complained about ASCII byte. */
910 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
911 return NULL;
912 }
913 return Py_BuildValue("(u#n)", ch, consumed, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000914 }
915 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000916 wrong_exception_type(exc);
917 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000918 }
919}
920
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000922static PyObject *strict_errors(PyObject *self, PyObject *exc)
923{
924 return PyCodec_StrictErrors(exc);
925}
926
927
928static PyObject *ignore_errors(PyObject *self, PyObject *exc)
929{
930 return PyCodec_IgnoreErrors(exc);
931}
932
933
934static PyObject *replace_errors(PyObject *self, PyObject *exc)
935{
936 return PyCodec_ReplaceErrors(exc);
937}
938
939
940static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
941{
942 return PyCodec_XMLCharRefReplaceErrors(exc);
943}
944
945
946static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
947{
948 return PyCodec_BackslashReplaceErrors(exc);
949}
950
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000951static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000952{
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000953 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000954}
955
Martin v. Löwis43c57782009-05-10 08:15:24 +0000956static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000957{
Martin v. Löwis43c57782009-05-10 08:15:24 +0000958 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000959}
960
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000961static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000962{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000963 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 char *name;
965 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966 } methods[] =
967 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 {
969 "strict",
970 {
971 "strict_errors",
972 strict_errors,
973 METH_O,
974 PyDoc_STR("Implements the 'strict' error handling, which "
975 "raises a UnicodeError on coding errors.")
976 }
977 },
978 {
979 "ignore",
980 {
981 "ignore_errors",
982 ignore_errors,
983 METH_O,
984 PyDoc_STR("Implements the 'ignore' error handling, which "
985 "ignores malformed data and continues.")
986 }
987 },
988 {
989 "replace",
990 {
991 "replace_errors",
992 replace_errors,
993 METH_O,
994 PyDoc_STR("Implements the 'replace' error handling, which "
995 "replaces malformed data with a replacement marker.")
996 }
997 },
998 {
999 "xmlcharrefreplace",
1000 {
1001 "xmlcharrefreplace_errors",
1002 xmlcharrefreplace_errors,
1003 METH_O,
1004 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1005 "which replaces an unencodable character with the "
1006 "appropriate XML character reference.")
1007 }
1008 },
1009 {
1010 "backslashreplace",
1011 {
1012 "backslashreplace_errors",
1013 backslashreplace_errors,
1014 METH_O,
1015 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1016 "which replaces an unencodable character with a "
1017 "backslashed escape sequence.")
1018 }
1019 },
1020 {
1021 "surrogatepass",
1022 {
1023 "surrogatepass",
1024 surrogatepass_errors,
1025 METH_O
1026 }
1027 },
1028 {
1029 "surrogateescape",
1030 {
1031 "surrogateescape",
1032 surrogateescape_errors,
1033 METH_O
1034 }
1035 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001036 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001037
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001038 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001039 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001040 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001041
1042 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001044
1045 interp->codec_search_path = PyList_New(0);
1046 interp->codec_search_cache = PyDict_New();
1047 interp->codec_error_registry = PyDict_New();
1048
1049 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001050 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1052 int res;
1053 if (!func)
1054 Py_FatalError("can't initialize codec error registry");
1055 res = PyCodec_RegisterError(methods[i].name, func);
1056 Py_DECREF(func);
1057 if (res)
1058 Py_FatalError("can't initialize codec error registry");
1059 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001060 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001061
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001062 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 interp->codec_search_cache == NULL ||
1064 interp->codec_error_registry == NULL)
1065 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001066
Christian Heimes819b8bf2008-01-03 23:05:47 +00001067 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001068 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
1070 /* Ignore ImportErrors... this is done so that
1071 distributions can disable the encodings package. Note
1072 that other errors are not masked, e.g. SystemErrors
1073 raised to inform the user of an error in the Python
1074 configuration are still reported back to the user. */
1075 PyErr_Clear();
1076 return 0;
1077 }
1078 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001079 }
1080 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001081 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001082 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001083}