blob: fd67d1b9e183bc846422b7182f8bdb5c40d7acfd [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* Lowercase hexadecimal digit characters; index n holds the digit for n. */
const char *Py_hexdigits = "0123456789abcdef";
15
/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not downgrade startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
28
/* Forward declaration.  Callers in this file treat a non-zero return
   value as failure. */
static int _PyCodecRegistry_Init(void);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Guido van Rossum33831132000-06-29 14:50:15 +000056 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
68 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
70 register char ch = string[i];
71 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
246/* Helper function to create an incremental codec. */
247
248static
249PyObject *codec_getincrementalcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 const char *errors,
251 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252{
253 PyObject *codecs, *ret, *inccodec;
254
255 codecs = _PyCodec_Lookup(encoding);
256 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 inccodec = PyObject_GetAttrString(codecs, attrname);
259 Py_DECREF(codecs);
260 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 Py_DECREF(inccodec);
267 return ret;
268}
269
270/* Helper function to create a stream codec. */
271
272static
273PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 PyObject *stream,
275 const char *errors,
276 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000278 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
280 codecs = _PyCodec_Lookup(encoding);
281 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000284 codeccls = PyTuple_GET_ITEM(codecs, index);
285 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000287 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(codecs);
290 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000291}
292
Guido van Rossum98297ee2007-11-06 21:34:58 +0000293/* Convenience APIs to query the Codec registry.
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000296
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000297 */
298
299PyObject *PyCodec_Encoder(const char *encoding)
300{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302}
303
304PyObject *PyCodec_Decoder(const char *encoding)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
Thomas Woutersa9773292006-04-21 09:43:23 +0000309PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000311{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000312 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000313}
314
315PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000317{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000318 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000319}
320
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000321PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 PyObject *stream,
323 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000326}
327
328PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 PyObject *stream,
330 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000333}
334
335/* Encode an object (e.g. an Unicode object) using the given encoding
336 and return the resulting encoded object (usually a Python string).
337
338 errors is passed to the encoder factory as argument if non-NULL. */
339
340PyObject *PyCodec_Encode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000341 const char *encoding,
342 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000343{
344 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000345 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000346 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000347
348 encoder = PyCodec_Encoder(encoding);
349 if (encoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351
352 args = args_tuple(object, errors);
353 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000354 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000355
356 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000357 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000358 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359
Guido van Rossum98297ee2007-11-06 21:34:58 +0000360 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000361 PyTuple_GET_SIZE(result) != 2) {
362 PyErr_SetString(PyExc_TypeError,
363 "encoder must return a tuple (object, integer)");
364 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000365 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000366 v = PyTuple_GET_ITEM(result,0);
367 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000368 /* We don't check or use the second (integer) entry. */
369
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000370 Py_DECREF(args);
371 Py_DECREF(encoder);
372 Py_DECREF(result);
373 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000374
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000376 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377 Py_XDECREF(args);
378 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000379 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380}
381
382/* Decode an object (usually a Python string) using the given encoding
383 and return an equivalent object (e.g. an Unicode object).
384
385 errors is passed to the decoder factory as argument if non-NULL. */
386
387PyObject *PyCodec_Decode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000388 const char *encoding,
389 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000390{
391 PyObject *decoder = NULL;
392 PyObject *args = NULL, *result = NULL;
393 PyObject *v;
394
395 decoder = PyCodec_Decoder(encoding);
396 if (decoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000397 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000398
399 args = args_tuple(object, errors);
400 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000402
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000403 result = PyEval_CallObject(decoder,args);
404 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000405 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000406 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000407 PyTuple_GET_SIZE(result) != 2) {
408 PyErr_SetString(PyExc_TypeError,
409 "decoder must return a tuple (object,integer)");
410 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000411 }
412 v = PyTuple_GET_ITEM(result,0);
413 Py_INCREF(v);
414 /* We don't check or use the second (integer) entry. */
415
416 Py_DECREF(args);
417 Py_DECREF(decoder);
418 Py_DECREF(result);
419 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 onError:
422 Py_XDECREF(args);
423 Py_XDECREF(decoder);
424 Py_XDECREF(result);
425 return NULL;
426}
427
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000428/* Register the error handling callback function error under the name
429 name. This function will be called by the codec when it encounters
430 an unencodable characters/undecodable bytes and doesn't know the
431 callback name, when name is specified as the error parameter
432 in the call to the encode/decode function.
433 Return 0 on success, -1 on error */
434int PyCodec_RegisterError(const char *name, PyObject *error)
435{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000436 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000437 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000438 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000439 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000440 PyErr_SetString(PyExc_TypeError, "handler must be callable");
441 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000442 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000443 return PyDict_SetItemString(interp->codec_error_registry,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000444 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000445}
446
447/* Lookup the error handling callback function registered under the
448 name error. As a special case NULL can be passed, in which case
449 the error handling callback for strict encoding will be returned. */
450PyObject *PyCodec_LookupError(const char *name)
451{
452 PyObject *handler = NULL;
453
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000454 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000455 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000456 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000457
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000458 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000459 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000460 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000461 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000462 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000463 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000465 return handler;
466}
467
468static void wrong_exception_type(PyObject *exc)
469{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200470 _Py_IDENTIFIER(__class__);
471 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200472 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000473 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200474 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000475 Py_DECREF(type);
476 if (name != NULL) {
477 PyErr_Format(PyExc_TypeError,
478 "don't know how to handle %S in error callback", name);
479 Py_DECREF(name);
480 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000481 }
482}
483
484PyObject *PyCodec_StrictErrors(PyObject *exc)
485{
Brett Cannonbf364092006-03-01 04:25:17 +0000486 if (PyExceptionInstance_Check(exc))
487 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000488 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000489 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000490 return NULL;
491}
492
493
494PyObject *PyCodec_IgnoreErrors(PyObject *exc)
495{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000496 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000497 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000498 if (PyUnicodeEncodeError_GetEnd(exc, &end))
499 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000500 }
501 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000502 if (PyUnicodeDecodeError_GetEnd(exc, &end))
503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000504 }
505 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000506 if (PyUnicodeTranslateError_GetEnd(exc, &end))
507 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000508 }
509 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000510 wrong_exception_type(exc);
511 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000512 }
Victor Stinneree450092011-12-01 02:52:11 +0100513 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000514}
515
516
517PyObject *PyCodec_ReplaceErrors(PyObject *exc)
518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200519 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000520
521 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200523 int kind;
524 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000525 if (PyUnicodeEncodeError_GetStart(exc, &start))
526 return NULL;
527 if (PyUnicodeEncodeError_GetEnd(exc, &end))
528 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200529 len = end - start;
530 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000531 if (res == NULL)
532 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 kind = PyUnicode_KIND(res);
534 data = PyUnicode_DATA(res);
535 for (i = 0; i < len; ++i)
536 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200537 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200538 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000539 }
540 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000541 if (PyUnicodeDecodeError_GetEnd(exc, &end))
542 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200543 return Py_BuildValue("(Cn)",
544 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
545 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000546 }
547 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000548 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200549 int kind;
550 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 if (PyUnicodeTranslateError_GetStart(exc, &start))
552 return NULL;
553 if (PyUnicodeTranslateError_GetEnd(exc, &end))
554 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555 len = end - start;
556 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000557 if (res == NULL)
558 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200559 kind = PyUnicode_KIND(res);
560 data = PyUnicode_DATA(res);
561 for (i=0; i < len; i++)
562 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200563 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200564 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000565 }
566 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000567 wrong_exception_type(exc);
568 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000569 }
570}
571
572PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
573{
574 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000575 PyObject *restuple;
576 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100577 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000578 Py_ssize_t start;
579 Py_ssize_t end;
580 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100581 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000582 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100583 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000584 if (PyUnicodeEncodeError_GetStart(exc, &start))
585 return NULL;
586 if (PyUnicodeEncodeError_GetEnd(exc, &end))
587 return NULL;
588 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
589 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100590 for (i = start, ressize = 0; i < end; ++i) {
591 /* object is guaranteed to be "ready" */
592 ch = PyUnicode_READ_CHAR(object, i);
593 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000594 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100595 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000596 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100597 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100599 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000600 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100601 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100603 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 ressize += 2+6+1;
605 else
606 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 }
608 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100609 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 if (res == NULL) {
611 Py_DECREF(object);
612 return NULL;
613 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100614 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100616 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000617 int digits;
618 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100619 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000620 *outp++ = '&';
621 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100622 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000623 digits = 1;
624 base = 1;
625 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100626 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 digits = 2;
628 base = 10;
629 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100630 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 digits = 3;
632 base = 100;
633 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100634 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 digits = 4;
636 base = 1000;
637 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100638 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000639 digits = 5;
640 base = 10000;
641 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100642 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643 digits = 6;
644 base = 100000;
645 }
646 else {
647 digits = 7;
648 base = 1000000;
649 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100651 *outp++ = '0' + ch/base;
652 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 base /= 10;
654 }
655 *outp++ = ';';
656 }
Victor Stinner8f825062012-04-27 13:55:39 +0200657 assert(_PyUnicode_CheckConsistency(res, 1));
658 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 Py_DECREF(object);
660 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000661 }
662 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663 wrong_exception_type(exc);
664 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000665 }
666}
667
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
669{
670 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000671 PyObject *restuple;
672 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100673 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 Py_ssize_t start;
675 Py_ssize_t end;
676 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100677 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100679 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000680 if (PyUnicodeEncodeError_GetStart(exc, &start))
681 return NULL;
682 if (PyUnicodeEncodeError_GetEnd(exc, &end))
683 return NULL;
684 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
685 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100686 for (i = start, ressize = 0; i < end; ++i) {
687 /* object is guaranteed to be "ready" */
688 c = PyUnicode_READ_CHAR(object, i);
689 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100691 }
692 else if (c >= 0x100) {
693 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 }
695 else
696 ressize += 1+1+2;
697 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100698 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699 if (res==NULL)
700 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100701 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
702 i < end; ++i) {
703 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (c >= 0x00010000) {
706 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200707 *outp++ = Py_hexdigits[(c>>28)&0xf];
708 *outp++ = Py_hexdigits[(c>>24)&0xf];
709 *outp++ = Py_hexdigits[(c>>20)&0xf];
710 *outp++ = Py_hexdigits[(c>>16)&0xf];
711 *outp++ = Py_hexdigits[(c>>12)&0xf];
712 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000714 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000715 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200716 *outp++ = Py_hexdigits[(c>>12)&0xf];
717 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000718 }
719 else
720 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200721 *outp++ = Py_hexdigits[(c>>4)&0xf];
722 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000723 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000724
Victor Stinner8f825062012-04-27 13:55:39 +0200725 assert(_PyUnicode_CheckConsistency(res, 1));
726 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000727 Py_DECREF(object);
728 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000729 }
730 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 wrong_exception_type(exc);
732 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000733 }
734}
735
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000736/* This handler is declared static until someone demonstrates
737 a need to call it directly. */
738static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000739PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000740{
741 PyObject *restuple;
742 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100743 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000744 Py_ssize_t start;
745 Py_ssize_t end;
746 PyObject *res;
747 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 char *outp;
749 if (PyUnicodeEncodeError_GetStart(exc, &start))
750 return NULL;
751 if (PyUnicodeEncodeError_GetEnd(exc, &end))
752 return NULL;
753 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
754 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
756 if (!res) {
757 Py_DECREF(object);
758 return NULL;
759 }
760 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100761 for (i = start; i < end; i++) {
762 /* object is guaranteed to be "ready" */
763 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 if (ch < 0xd800 || ch > 0xdfff) {
765 /* Not a surrogate, fail with original exception */
766 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
767 Py_DECREF(res);
768 Py_DECREF(object);
769 return NULL;
770 }
771 *outp++ = (char)(0xe0 | (ch >> 12));
772 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
773 *outp++ = (char)(0x80 | (ch & 0x3f));
774 }
775 restuple = Py_BuildValue("(On)", res, end);
776 Py_DECREF(res);
777 Py_DECREF(object);
778 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000779 }
780 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100782 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 if (PyUnicodeDecodeError_GetStart(exc, &start))
784 return NULL;
785 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
786 return NULL;
787 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
788 Py_DECREF(object);
789 return NULL;
790 }
791 /* Try decoding a single surrogate character. If
792 there are more, let the codec call us again. */
793 p += start;
Ezio Melotti540da762012-11-03 23:03:39 +0200794 if (PyBytes_GET_SIZE(object) - start >= 3 &&
795 (p[0] & 0xf0) == 0xe0 &&
796 (p[1] & 0xc0) == 0x80 &&
797 (p[2] & 0xc0) == 0x80) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 /* it's a three-byte code */
799 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
800 if (ch < 0xd800 || ch > 0xdfff)
801 /* it's not a surrogate - fail */
802 ch = 0;
803 }
804 Py_DECREF(object);
805 if (ch == 0) {
806 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
807 return NULL;
808 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100809 res = PyUnicode_FromOrdinal(ch);
810 if (res == NULL)
811 return NULL;
812 return Py_BuildValue("(Nn)", res, start+3);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000813 }
814 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000815 wrong_exception_type(exc);
816 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000817 }
818}
819
Martin v. Löwis011e8422009-05-05 04:43:17 +0000820static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000821PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000822{
823 PyObject *restuple;
824 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000826 Py_ssize_t start;
827 Py_ssize_t end;
828 PyObject *res;
829 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 char *outp;
831 if (PyUnicodeEncodeError_GetStart(exc, &start))
832 return NULL;
833 if (PyUnicodeEncodeError_GetEnd(exc, &end))
834 return NULL;
835 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
836 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 res = PyBytes_FromStringAndSize(NULL, end-start);
838 if (!res) {
839 Py_DECREF(object);
840 return NULL;
841 }
842 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100843 for (i = start; i < end; i++) {
844 /* object is guaranteed to be "ready" */
845 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 if (ch < 0xdc80 || ch > 0xdcff) {
847 /* Not a UTF-8b surrogate, fail with original exception */
848 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
849 Py_DECREF(res);
850 Py_DECREF(object);
851 return NULL;
852 }
853 *outp++ = ch - 0xdc00;
854 }
855 restuple = Py_BuildValue("(On)", res, end);
856 Py_DECREF(res);
857 Py_DECREF(object);
858 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000859 }
860 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100861 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000862 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100863 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000864 int consumed = 0;
865 if (PyUnicodeDecodeError_GetStart(exc, &start))
866 return NULL;
867 if (PyUnicodeDecodeError_GetEnd(exc, &end))
868 return NULL;
869 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
870 return NULL;
871 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
872 Py_DECREF(object);
873 return NULL;
874 }
875 while (consumed < 4 && consumed < end-start) {
876 /* Refuse to escape ASCII bytes. */
877 if (p[start+consumed] < 128)
878 break;
879 ch[consumed] = 0xdc00 + p[start+consumed];
880 consumed++;
881 }
882 Py_DECREF(object);
883 if (!consumed) {
884 /* codec complained about ASCII byte. */
885 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
886 return NULL;
887 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100888 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
889 if (str == NULL)
890 return NULL;
891 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000892 }
893 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000894 wrong_exception_type(exc);
895 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000896 }
897}
898
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000900static PyObject *strict_errors(PyObject *self, PyObject *exc)
901{
902 return PyCodec_StrictErrors(exc);
903}
904
905
906static PyObject *ignore_errors(PyObject *self, PyObject *exc)
907{
908 return PyCodec_IgnoreErrors(exc);
909}
910
911
912static PyObject *replace_errors(PyObject *self, PyObject *exc)
913{
914 return PyCodec_ReplaceErrors(exc);
915}
916
917
918static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
919{
920 return PyCodec_XMLCharRefReplaceErrors(exc);
921}
922
923
924static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
925{
926 return PyCodec_BackslashReplaceErrors(exc);
927}
928
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000929static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000930{
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000931 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000932}
933
Martin v. Löwis43c57782009-05-10 08:15:24 +0000934static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000935{
Martin v. Löwis43c57782009-05-10 08:15:24 +0000936 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000937}
938
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000939static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000940{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000942 char *name;
943 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000944 } methods[] =
945 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000946 {
947 "strict",
948 {
949 "strict_errors",
950 strict_errors,
951 METH_O,
952 PyDoc_STR("Implements the 'strict' error handling, which "
953 "raises a UnicodeError on coding errors.")
954 }
955 },
956 {
957 "ignore",
958 {
959 "ignore_errors",
960 ignore_errors,
961 METH_O,
962 PyDoc_STR("Implements the 'ignore' error handling, which "
963 "ignores malformed data and continues.")
964 }
965 },
966 {
967 "replace",
968 {
969 "replace_errors",
970 replace_errors,
971 METH_O,
972 PyDoc_STR("Implements the 'replace' error handling, which "
973 "replaces malformed data with a replacement marker.")
974 }
975 },
976 {
977 "xmlcharrefreplace",
978 {
979 "xmlcharrefreplace_errors",
980 xmlcharrefreplace_errors,
981 METH_O,
982 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
983 "which replaces an unencodable character with the "
984 "appropriate XML character reference.")
985 }
986 },
987 {
988 "backslashreplace",
989 {
990 "backslashreplace_errors",
991 backslashreplace_errors,
992 METH_O,
993 PyDoc_STR("Implements the 'backslashreplace' error handling, "
994 "which replaces an unencodable character with a "
995 "backslashed escape sequence.")
996 }
997 },
998 {
999 "surrogatepass",
1000 {
1001 "surrogatepass",
1002 surrogatepass_errors,
1003 METH_O
1004 }
1005 },
1006 {
1007 "surrogateescape",
1008 {
1009 "surrogateescape",
1010 surrogateescape_errors,
1011 METH_O
1012 }
1013 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001014 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001015
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001016 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001017 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001018 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001019
1020 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001022
1023 interp->codec_search_path = PyList_New(0);
1024 interp->codec_search_cache = PyDict_New();
1025 interp->codec_error_registry = PyDict_New();
1026
1027 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001028 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001029 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1030 int res;
1031 if (!func)
1032 Py_FatalError("can't initialize codec error registry");
1033 res = PyCodec_RegisterError(methods[i].name, func);
1034 Py_DECREF(func);
1035 if (res)
1036 Py_FatalError("can't initialize codec error registry");
1037 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001038 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001039
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001040 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001041 interp->codec_search_cache == NULL ||
1042 interp->codec_error_registry == NULL)
1043 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001044
Christian Heimes819b8bf2008-01-03 23:05:47 +00001045 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001046 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001047 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001048 }
1049 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001050 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001051 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001052}