blob: 5ebc4cb5f6e9e535e4ac41257f15cb279108ce76 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Victor Stinnerf5cff562011-10-14 02:13:11 +020014const char *Py_hexdigits = "0123456789abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
/* Import the standard encodings package, which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not slow
   down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Guido van Rossum33831132000-06-29 14:50:15 +000056 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
68 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
70 register char ch = string[i];
71 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
246/* Helper function to create an incremental codec. */
247
248static
249PyObject *codec_getincrementalcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 const char *errors,
251 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252{
253 PyObject *codecs, *ret, *inccodec;
254
255 codecs = _PyCodec_Lookup(encoding);
256 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 inccodec = PyObject_GetAttrString(codecs, attrname);
259 Py_DECREF(codecs);
260 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 Py_DECREF(inccodec);
267 return ret;
268}
269
270/* Helper function to create a stream codec. */
271
272static
273PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 PyObject *stream,
275 const char *errors,
276 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000278 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
280 codecs = _PyCodec_Lookup(encoding);
281 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000284 codeccls = PyTuple_GET_ITEM(codecs, index);
285 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000287 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(codecs);
290 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000291}
292
Guido van Rossum98297ee2007-11-06 21:34:58 +0000293/* Convenience APIs to query the Codec registry.
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000296
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000297 */
298
299PyObject *PyCodec_Encoder(const char *encoding)
300{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302}
303
304PyObject *PyCodec_Decoder(const char *encoding)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
Thomas Woutersa9773292006-04-21 09:43:23 +0000309PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000311{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000312 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000313}
314
315PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000317{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000318 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000319}
320
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000321PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 PyObject *stream,
323 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000326}
327
328PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 PyObject *stream,
330 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000333}
334
335/* Encode an object (e.g. an Unicode object) using the given encoding
336 and return the resulting encoded object (usually a Python string).
337
338 errors is passed to the encoder factory as argument if non-NULL. */
339
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200340static PyObject *
341_PyCodec_EncodeInternal(PyObject *object,
342 PyObject *encoder,
343 const char *encoding,
344 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000345{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000346 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000347 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349 args = args_tuple(object, errors);
350 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000351 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000352
353 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000354 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000355 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000356
Guido van Rossum98297ee2007-11-06 21:34:58 +0000357 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000358 PyTuple_GET_SIZE(result) != 2) {
359 PyErr_SetString(PyExc_TypeError,
360 "encoder must return a tuple (object, integer)");
361 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000362 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000363 v = PyTuple_GET_ITEM(result,0);
364 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000365 /* We don't check or use the second (integer) entry. */
366
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000367 Py_DECREF(args);
368 Py_DECREF(encoder);
369 Py_DECREF(result);
370 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000371
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000372 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000373 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000374 Py_XDECREF(args);
375 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000376 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377}
378
379/* Decode an object (usually a Python string) using the given encoding
380 and return an equivalent object (e.g. an Unicode object).
381
382 errors is passed to the decoder factory as argument if non-NULL. */
383
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200384static PyObject *
385_PyCodec_DecodeInternal(PyObject *object,
386 PyObject *decoder,
387 const char *encoding,
388 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000389{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000390 PyObject *args = NULL, *result = NULL;
391 PyObject *v;
392
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000393 args = args_tuple(object, errors);
394 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000395 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000396
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000397 result = PyEval_CallObject(decoder,args);
398 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000399 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000400 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000401 PyTuple_GET_SIZE(result) != 2) {
402 PyErr_SetString(PyExc_TypeError,
403 "decoder must return a tuple (object,integer)");
404 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000405 }
406 v = PyTuple_GET_ITEM(result,0);
407 Py_INCREF(v);
408 /* We don't check or use the second (integer) entry. */
409
410 Py_DECREF(args);
411 Py_DECREF(decoder);
412 Py_DECREF(result);
413 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000414
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415 onError:
416 Py_XDECREF(args);
417 Py_XDECREF(decoder);
418 Py_XDECREF(result);
419 return NULL;
420}
421
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200422/* Generic encoding/decoding API */
423PyObject *PyCodec_Encode(PyObject *object,
424 const char *encoding,
425 const char *errors)
426{
427 PyObject *encoder;
428
429 encoder = PyCodec_Encoder(encoding);
430 if (encoder == NULL)
431 return NULL;
432
433 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
434}
435
436PyObject *PyCodec_Decode(PyObject *object,
437 const char *encoding,
438 const char *errors)
439{
440 PyObject *decoder;
441
442 decoder = PyCodec_Decoder(encoding);
443 if (decoder == NULL)
444 return NULL;
445
446 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
447}
448
449/* Text encoding/decoding API */
450static
451PyObject *codec_getitem_checked(const char *encoding,
452 const char *operation_name,
453 int index)
454{
455 _Py_IDENTIFIER(_is_text_encoding);
456 PyObject *codec;
457 PyObject *attr;
458 PyObject *v;
459 int is_text_codec;
460
461 codec = _PyCodec_Lookup(encoding);
462 if (codec == NULL)
463 return NULL;
464
465 /* Backwards compatibility: assume any raw tuple describes a text
466 * encoding, and the same for anything lacking the private
467 * attribute.
468 */
469 if (!PyTuple_CheckExact(codec)) {
470 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
471 if (attr == NULL) {
472 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
473 PyErr_Clear();
474 } else {
475 Py_DECREF(codec);
476 return NULL;
477 }
478 } else {
479 is_text_codec = PyObject_IsTrue(attr);
480 Py_DECREF(attr);
481 if (!is_text_codec) {
482 Py_DECREF(codec);
483 PyErr_Format(PyExc_LookupError,
484 "'%.400s' is not a text encoding; "
485 "use codecs.%s() to handle arbitrary codecs",
486 encoding, operation_name);
487 return NULL;
488 }
489 }
490 }
491
492 v = PyTuple_GET_ITEM(codec, index);
493 Py_DECREF(codec);
494 Py_INCREF(v);
495 return v;
496}
497
498static PyObject * _PyCodec_TextEncoder(const char *encoding)
499{
500 return codec_getitem_checked(encoding, "encode", 0);
501}
502
503static PyObject * _PyCodec_TextDecoder(const char *encoding)
504{
505 return codec_getitem_checked(encoding, "decode", 1);
506}
507
508PyObject *_PyCodec_EncodeText(PyObject *object,
509 const char *encoding,
510 const char *errors)
511{
512 PyObject *encoder;
513
514 encoder = _PyCodec_TextEncoder(encoding);
515 if (encoder == NULL)
516 return NULL;
517
518 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
519}
520
521PyObject *_PyCodec_DecodeText(PyObject *object,
522 const char *encoding,
523 const char *errors)
524{
525 PyObject *decoder;
526
527 decoder = _PyCodec_TextDecoder(encoding);
528 if (decoder == NULL)
529 return NULL;
530
531 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
532}
533
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000534/* Register the error handling callback function error under the name
535 name. This function will be called by the codec when it encounters
536 an unencodable characters/undecodable bytes and doesn't know the
537 callback name, when name is specified as the error parameter
538 in the call to the encode/decode function.
539 Return 0 on success, -1 on error */
540int PyCodec_RegisterError(const char *name, PyObject *error)
541{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000542 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000543 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000544 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000545 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000546 PyErr_SetString(PyExc_TypeError, "handler must be callable");
547 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000548 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000549 return PyDict_SetItemString(interp->codec_error_registry,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000550 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000551}
552
553/* Lookup the error handling callback function registered under the
554 name error. As a special case NULL can be passed, in which case
555 the error handling callback for strict encoding will be returned. */
556PyObject *PyCodec_LookupError(const char *name)
557{
558 PyObject *handler = NULL;
559
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000560 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000561 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000562 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000563
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000564 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000565 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000566 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000567 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000568 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000569 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000570 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000571 return handler;
572}
573
574static void wrong_exception_type(PyObject *exc)
575{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200576 _Py_IDENTIFIER(__class__);
577 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200578 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000579 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200580 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000581 Py_DECREF(type);
582 if (name != NULL) {
583 PyErr_Format(PyExc_TypeError,
584 "don't know how to handle %S in error callback", name);
585 Py_DECREF(name);
586 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000587 }
588}
589
590PyObject *PyCodec_StrictErrors(PyObject *exc)
591{
Brett Cannonbf364092006-03-01 04:25:17 +0000592 if (PyExceptionInstance_Check(exc))
593 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000594 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000595 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000596 return NULL;
597}
598
599
600PyObject *PyCodec_IgnoreErrors(PyObject *exc)
601{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000602 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000603 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 if (PyUnicodeEncodeError_GetEnd(exc, &end))
605 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000606 }
607 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 if (PyUnicodeDecodeError_GetEnd(exc, &end))
609 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000610 }
611 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000612 if (PyUnicodeTranslateError_GetEnd(exc, &end))
613 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000614 }
615 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 wrong_exception_type(exc);
617 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000618 }
Victor Stinneree450092011-12-01 02:52:11 +0100619 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000620}
621
622
623PyObject *PyCodec_ReplaceErrors(PyObject *exc)
624{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200625 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626
627 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000628 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200629 int kind;
630 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 if (PyUnicodeEncodeError_GetStart(exc, &start))
632 return NULL;
633 if (PyUnicodeEncodeError_GetEnd(exc, &end))
634 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200635 len = end - start;
636 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 if (res == NULL)
638 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200639 kind = PyUnicode_KIND(res);
640 data = PyUnicode_DATA(res);
641 for (i = 0; i < len; ++i)
642 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200643 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200644 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 }
646 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 if (PyUnicodeDecodeError_GetEnd(exc, &end))
648 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200649 return Py_BuildValue("(Cn)",
650 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
651 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 }
653 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 int kind;
656 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 if (PyUnicodeTranslateError_GetStart(exc, &start))
658 return NULL;
659 if (PyUnicodeTranslateError_GetEnd(exc, &end))
660 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200661 len = end - start;
662 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663 if (res == NULL)
664 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200665 kind = PyUnicode_KIND(res);
666 data = PyUnicode_DATA(res);
667 for (i=0; i < len; i++)
668 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200669 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200670 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000671 }
672 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 wrong_exception_type(exc);
674 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000675 }
676}
677
678PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
679{
680 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000681 PyObject *restuple;
682 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100683 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 Py_ssize_t start;
685 Py_ssize_t end;
686 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100687 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000688 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100689 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 if (PyUnicodeEncodeError_GetStart(exc, &start))
691 return NULL;
692 if (PyUnicodeEncodeError_GetEnd(exc, &end))
693 return NULL;
694 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
695 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100696 for (i = start, ressize = 0; i < end; ++i) {
697 /* object is guaranteed to be "ready" */
698 ch = PyUnicode_READ_CHAR(object, i);
699 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000700 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100701 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000702 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100703 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100705 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100707 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000708 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100709 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 ressize += 2+6+1;
711 else
712 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 }
714 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100715 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000716 if (res == NULL) {
717 Py_DECREF(object);
718 return NULL;
719 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100720 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000721 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100722 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000723 int digits;
724 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100725 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000726 *outp++ = '&';
727 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100728 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 digits = 1;
730 base = 1;
731 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100732 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 digits = 2;
734 base = 10;
735 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100736 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 digits = 3;
738 base = 100;
739 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100740 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000741 digits = 4;
742 base = 1000;
743 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100744 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 digits = 5;
746 base = 10000;
747 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100748 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000749 digits = 6;
750 base = 100000;
751 }
752 else {
753 digits = 7;
754 base = 1000000;
755 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100757 *outp++ = '0' + ch/base;
758 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000759 base /= 10;
760 }
761 *outp++ = ';';
762 }
Victor Stinner8f825062012-04-27 13:55:39 +0200763 assert(_PyUnicode_CheckConsistency(res, 1));
764 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 Py_DECREF(object);
766 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000767 }
768 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 wrong_exception_type(exc);
770 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000771 }
772}
773
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000774PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
775{
776 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 PyObject *restuple;
778 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100779 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 Py_ssize_t start;
781 Py_ssize_t end;
782 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100783 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100785 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 if (PyUnicodeEncodeError_GetStart(exc, &start))
787 return NULL;
788 if (PyUnicodeEncodeError_GetEnd(exc, &end))
789 return NULL;
790 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
791 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 for (i = start, ressize = 0; i < end; ++i) {
793 /* object is guaranteed to be "ready" */
794 c = PyUnicode_READ_CHAR(object, i);
795 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100797 }
798 else if (c >= 0x100) {
799 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 }
801 else
802 ressize += 1+1+2;
803 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100804 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 if (res==NULL)
806 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100807 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
808 i < end; ++i) {
809 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 if (c >= 0x00010000) {
812 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200813 *outp++ = Py_hexdigits[(c>>28)&0xf];
814 *outp++ = Py_hexdigits[(c>>24)&0xf];
815 *outp++ = Py_hexdigits[(c>>20)&0xf];
816 *outp++ = Py_hexdigits[(c>>16)&0xf];
817 *outp++ = Py_hexdigits[(c>>12)&0xf];
818 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000820 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200822 *outp++ = Py_hexdigits[(c>>12)&0xf];
823 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 }
825 else
826 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200827 *outp++ = Py_hexdigits[(c>>4)&0xf];
828 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000830
Victor Stinner8f825062012-04-27 13:55:39 +0200831 assert(_PyUnicode_CheckConsistency(res, 1));
832 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 Py_DECREF(object);
834 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000835 }
836 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 wrong_exception_type(exc);
838 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000839 }
840}
841
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000842/* This handler is declared static until someone demonstrates
843 a need to call it directly. */
844static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000845PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000846{
847 PyObject *restuple;
848 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100849 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000850 Py_ssize_t start;
851 Py_ssize_t end;
852 PyObject *res;
853 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 char *outp;
855 if (PyUnicodeEncodeError_GetStart(exc, &start))
856 return NULL;
857 if (PyUnicodeEncodeError_GetEnd(exc, &end))
858 return NULL;
859 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
860 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
862 if (!res) {
863 Py_DECREF(object);
864 return NULL;
865 }
866 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100867 for (i = start; i < end; i++) {
868 /* object is guaranteed to be "ready" */
869 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 if (ch < 0xd800 || ch > 0xdfff) {
871 /* Not a surrogate, fail with original exception */
872 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
873 Py_DECREF(res);
874 Py_DECREF(object);
875 return NULL;
876 }
877 *outp++ = (char)(0xe0 | (ch >> 12));
878 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
879 *outp++ = (char)(0x80 | (ch & 0x3f));
880 }
881 restuple = Py_BuildValue("(On)", res, end);
882 Py_DECREF(res);
883 Py_DECREF(object);
884 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000885 }
886 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000887 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100888 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 if (PyUnicodeDecodeError_GetStart(exc, &start))
890 return NULL;
891 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
892 return NULL;
893 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
894 Py_DECREF(object);
895 return NULL;
896 }
897 /* Try decoding a single surrogate character. If
898 there are more, let the codec call us again. */
899 p += start;
Ezio Melotti540da762012-11-03 23:03:39 +0200900 if (PyBytes_GET_SIZE(object) - start >= 3 &&
901 (p[0] & 0xf0) == 0xe0 &&
902 (p[1] & 0xc0) == 0x80 &&
903 (p[2] & 0xc0) == 0x80) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904 /* it's a three-byte code */
905 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
906 if (ch < 0xd800 || ch > 0xdfff)
907 /* it's not a surrogate - fail */
908 ch = 0;
909 }
910 Py_DECREF(object);
911 if (ch == 0) {
912 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
913 return NULL;
914 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100915 res = PyUnicode_FromOrdinal(ch);
916 if (res == NULL)
917 return NULL;
918 return Py_BuildValue("(Nn)", res, start+3);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000919 }
920 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000921 wrong_exception_type(exc);
922 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000923 }
924}
925
Martin v. Löwis011e8422009-05-05 04:43:17 +0000926static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000927PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000928{
929 PyObject *restuple;
930 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100931 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000932 Py_ssize_t start;
933 Py_ssize_t end;
934 PyObject *res;
935 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000936 char *outp;
937 if (PyUnicodeEncodeError_GetStart(exc, &start))
938 return NULL;
939 if (PyUnicodeEncodeError_GetEnd(exc, &end))
940 return NULL;
941 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
942 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000943 res = PyBytes_FromStringAndSize(NULL, end-start);
944 if (!res) {
945 Py_DECREF(object);
946 return NULL;
947 }
948 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100949 for (i = start; i < end; i++) {
950 /* object is guaranteed to be "ready" */
951 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000952 if (ch < 0xdc80 || ch > 0xdcff) {
953 /* Not a UTF-8b surrogate, fail with original exception */
954 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
955 Py_DECREF(res);
956 Py_DECREF(object);
957 return NULL;
958 }
959 *outp++ = ch - 0xdc00;
960 }
961 restuple = Py_BuildValue("(On)", res, end);
962 Py_DECREF(res);
963 Py_DECREF(object);
964 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000965 }
966 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100967 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000968 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100969 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000970 int consumed = 0;
971 if (PyUnicodeDecodeError_GetStart(exc, &start))
972 return NULL;
973 if (PyUnicodeDecodeError_GetEnd(exc, &end))
974 return NULL;
975 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
976 return NULL;
977 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
978 Py_DECREF(object);
979 return NULL;
980 }
981 while (consumed < 4 && consumed < end-start) {
982 /* Refuse to escape ASCII bytes. */
983 if (p[start+consumed] < 128)
984 break;
985 ch[consumed] = 0xdc00 + p[start+consumed];
986 consumed++;
987 }
988 Py_DECREF(object);
989 if (!consumed) {
990 /* codec complained about ASCII byte. */
991 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
992 return NULL;
993 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100994 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
995 if (str == NULL)
996 return NULL;
997 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000998 }
999 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001000 wrong_exception_type(exc);
1001 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001002 }
1003}
1004
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001005
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001006static PyObject *strict_errors(PyObject *self, PyObject *exc)
1007{
1008 return PyCodec_StrictErrors(exc);
1009}
1010
1011
1012static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1013{
1014 return PyCodec_IgnoreErrors(exc);
1015}
1016
1017
1018static PyObject *replace_errors(PyObject *self, PyObject *exc)
1019{
1020 return PyCodec_ReplaceErrors(exc);
1021}
1022
1023
1024static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1025{
1026 return PyCodec_XMLCharRefReplaceErrors(exc);
1027}
1028
1029
1030static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1031{
1032 return PyCodec_BackslashReplaceErrors(exc);
1033}
1034
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001035static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001036{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001037 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001038}
1039
Martin v. Löwis43c57782009-05-10 08:15:24 +00001040static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001041{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001042 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001043}
1044
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001045static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001046{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001047 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048 char *name;
1049 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001050 } methods[] =
1051 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001052 {
1053 "strict",
1054 {
1055 "strict_errors",
1056 strict_errors,
1057 METH_O,
1058 PyDoc_STR("Implements the 'strict' error handling, which "
1059 "raises a UnicodeError on coding errors.")
1060 }
1061 },
1062 {
1063 "ignore",
1064 {
1065 "ignore_errors",
1066 ignore_errors,
1067 METH_O,
1068 PyDoc_STR("Implements the 'ignore' error handling, which "
1069 "ignores malformed data and continues.")
1070 }
1071 },
1072 {
1073 "replace",
1074 {
1075 "replace_errors",
1076 replace_errors,
1077 METH_O,
1078 PyDoc_STR("Implements the 'replace' error handling, which "
1079 "replaces malformed data with a replacement marker.")
1080 }
1081 },
1082 {
1083 "xmlcharrefreplace",
1084 {
1085 "xmlcharrefreplace_errors",
1086 xmlcharrefreplace_errors,
1087 METH_O,
1088 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1089 "which replaces an unencodable character with the "
1090 "appropriate XML character reference.")
1091 }
1092 },
1093 {
1094 "backslashreplace",
1095 {
1096 "backslashreplace_errors",
1097 backslashreplace_errors,
1098 METH_O,
1099 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1100 "which replaces an unencodable character with a "
1101 "backslashed escape sequence.")
1102 }
1103 },
1104 {
1105 "surrogatepass",
1106 {
1107 "surrogatepass",
1108 surrogatepass_errors,
1109 METH_O
1110 }
1111 },
1112 {
1113 "surrogateescape",
1114 {
1115 "surrogateescape",
1116 surrogateescape_errors,
1117 METH_O
1118 }
1119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001120 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001121
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001122 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001123 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001124 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001125
1126 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001127 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001128
1129 interp->codec_search_path = PyList_New(0);
1130 interp->codec_search_cache = PyDict_New();
1131 interp->codec_error_registry = PyDict_New();
1132
1133 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001134 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001135 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1136 int res;
1137 if (!func)
1138 Py_FatalError("can't initialize codec error registry");
1139 res = PyCodec_RegisterError(methods[i].name, func);
1140 Py_DECREF(func);
1141 if (res)
1142 Py_FatalError("can't initialize codec error registry");
1143 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001145
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001146 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001147 interp->codec_search_cache == NULL ||
1148 interp->codec_error_registry == NULL)
1149 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001150
Christian Heimes819b8bf2008-01-03 23:05:47 +00001151 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001152 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001154 }
1155 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001156 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001157 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001158}