blob: 8fe0af7bf09e6bf0baa28395966013cc8f4aec94 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* Lower-case hexadecimal digits; index n (0..15) yields the digit for n. */
const char *Py_hexdigits = "0123456789abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
18/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000019 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000020
21 This is done in a lazy way so that the Unicode implementation does
22 not downgrade startup time of scripts not needing it.
23
Guido van Rossumb95de4f2000-03-31 17:25:23 +000024 ImportErrors are silently ignored by this function. Only one try is
25 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000026
27*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020068 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020070 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
246/* Helper function to create an incremental codec. */
247
248static
249PyObject *codec_getincrementalcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 const char *errors,
251 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252{
253 PyObject *codecs, *ret, *inccodec;
254
255 codecs = _PyCodec_Lookup(encoding);
256 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 inccodec = PyObject_GetAttrString(codecs, attrname);
259 Py_DECREF(codecs);
260 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 Py_DECREF(inccodec);
267 return ret;
268}
269
270/* Helper function to create a stream codec. */
271
272static
273PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 PyObject *stream,
275 const char *errors,
276 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000278 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
280 codecs = _PyCodec_Lookup(encoding);
281 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000284 codeccls = PyTuple_GET_ITEM(codecs, index);
285 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000287 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(codecs);
290 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000291}
292
Guido van Rossum98297ee2007-11-06 21:34:58 +0000293/* Convenience APIs to query the Codec registry.
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000296
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000297 */
298
299PyObject *PyCodec_Encoder(const char *encoding)
300{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302}
303
304PyObject *PyCodec_Decoder(const char *encoding)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
Thomas Woutersa9773292006-04-21 09:43:23 +0000309PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000311{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000312 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000313}
314
315PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000317{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000318 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000319}
320
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000321PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 PyObject *stream,
323 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000326}
327
328PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 PyObject *stream,
330 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000333}
334
/* Annotate the currently active exception with the codec operation that
 * triggered it, without changing the type of the exception raised.
 *
 * operation is a word such as "encoding" or "decoding"; encoding is the
 * codec name.  Both are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause swaps in a suitably updated clone of the
     * active exception when it is able to; when it is not, the original
     * exception is left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351/* Encode an object (e.g. an Unicode object) using the given encoding
352 and return the resulting encoded object (usually a Python string).
353
354 errors is passed to the encoder factory as argument if non-NULL. */
355
356PyObject *PyCodec_Encode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 const char *encoding,
358 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359{
360 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000361 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000362 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000363
364 encoder = PyCodec_Encoder(encoding);
365 if (encoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000367
368 args = args_tuple(object, errors);
369 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000370 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000371
372 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000373 if (result == NULL) {
374 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000375 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000376 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377
Guido van Rossum98297ee2007-11-06 21:34:58 +0000378 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 PyTuple_GET_SIZE(result) != 2) {
380 PyErr_SetString(PyExc_TypeError,
381 "encoder must return a tuple (object, integer)");
382 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000384 v = PyTuple_GET_ITEM(result,0);
385 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000386 /* We don't check or use the second (integer) entry. */
387
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000388 Py_DECREF(args);
389 Py_DECREF(encoder);
390 Py_DECREF(result);
391 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000393 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000394 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000395 Py_XDECREF(args);
396 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000397 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000398}
399
400/* Decode an object (usually a Python string) using the given encoding
401 and return an equivalent object (e.g. an Unicode object).
402
403 errors is passed to the decoder factory as argument if non-NULL. */
404
405PyObject *PyCodec_Decode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 const char *encoding,
407 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000408{
409 PyObject *decoder = NULL;
410 PyObject *args = NULL, *result = NULL;
411 PyObject *v;
412
413 decoder = PyCodec_Decoder(encoding);
414 if (decoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000415 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000416
417 args = args_tuple(object, errors);
418 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000422 if (result == NULL) {
423 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000424 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000425 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000426 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 PyTuple_GET_SIZE(result) != 2) {
428 PyErr_SetString(PyExc_TypeError,
429 "decoder must return a tuple (object,integer)");
430 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000431 }
432 v = PyTuple_GET_ITEM(result,0);
433 Py_INCREF(v);
434 /* We don't check or use the second (integer) entry. */
435
436 Py_DECREF(args);
437 Py_DECREF(decoder);
438 Py_DECREF(result);
439 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000440
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000441 onError:
442 Py_XDECREF(args);
443 Py_XDECREF(decoder);
444 Py_XDECREF(result);
445 return NULL;
446}
447
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000448/* Register the error handling callback function error under the name
449 name. This function will be called by the codec when it encounters
450 an unencodable characters/undecodable bytes and doesn't know the
451 callback name, when name is specified as the error parameter
452 in the call to the encode/decode function.
453 Return 0 on success, -1 on error */
454int PyCodec_RegisterError(const char *name, PyObject *error)
455{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000456 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000457 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000459 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000460 PyErr_SetString(PyExc_TypeError, "handler must be callable");
461 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000462 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000463 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300464 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000465}
466
467/* Lookup the error handling callback function registered under the
468 name error. As a special case NULL can be passed, in which case
469 the error handling callback for strict encoding will be returned. */
470PyObject *PyCodec_LookupError(const char *name)
471{
472 PyObject *handler = NULL;
473
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000474 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000475 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000477
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000478 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300480 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000481 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000483 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000484 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000485 return handler;
486}
487
488static void wrong_exception_type(PyObject *exc)
489{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200490 _Py_IDENTIFIER(__class__);
491 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200492 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000493 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200494 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000495 Py_DECREF(type);
496 if (name != NULL) {
497 PyErr_Format(PyExc_TypeError,
498 "don't know how to handle %S in error callback", name);
499 Py_DECREF(name);
500 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000501 }
502}
503
504PyObject *PyCodec_StrictErrors(PyObject *exc)
505{
Brett Cannonbf364092006-03-01 04:25:17 +0000506 if (PyExceptionInstance_Check(exc))
507 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000508 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000509 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000510 return NULL;
511}
512
513
514PyObject *PyCodec_IgnoreErrors(PyObject *exc)
515{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000517 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000518 if (PyUnicodeEncodeError_GetEnd(exc, &end))
519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000520 }
521 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 if (PyUnicodeDecodeError_GetEnd(exc, &end))
523 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000524 }
525 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000526 if (PyUnicodeTranslateError_GetEnd(exc, &end))
527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528 }
529 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000530 wrong_exception_type(exc);
531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000532 }
Victor Stinneree450092011-12-01 02:52:11 +0100533 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000534}
535
536
537PyObject *PyCodec_ReplaceErrors(PyObject *exc)
538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000540
541 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000542 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 int kind;
544 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000545 if (PyUnicodeEncodeError_GetStart(exc, &start))
546 return NULL;
547 if (PyUnicodeEncodeError_GetEnd(exc, &end))
548 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200549 len = end - start;
550 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 if (res == NULL)
552 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 kind = PyUnicode_KIND(res);
554 data = PyUnicode_DATA(res);
555 for (i = 0; i < len; ++i)
556 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200557 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000559 }
560 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 if (PyUnicodeDecodeError_GetEnd(exc, &end))
562 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200563 return Py_BuildValue("(Cn)",
564 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
565 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000566 }
567 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000568 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 int kind;
570 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000571 if (PyUnicodeTranslateError_GetStart(exc, &start))
572 return NULL;
573 if (PyUnicodeTranslateError_GetEnd(exc, &end))
574 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200575 len = end - start;
576 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000577 if (res == NULL)
578 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200579 kind = PyUnicode_KIND(res);
580 data = PyUnicode_DATA(res);
581 for (i=0; i < len; i++)
582 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200583 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200584 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000585 }
586 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000587 wrong_exception_type(exc);
588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000589 }
590}
591
592PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
593{
594 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000595 PyObject *restuple;
596 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100597 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 Py_ssize_t start;
599 Py_ssize_t end;
600 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100601 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100603 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 if (PyUnicodeEncodeError_GetStart(exc, &start))
605 return NULL;
606 if (PyUnicodeEncodeError_GetEnd(exc, &end))
607 return NULL;
608 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
609 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100610 for (i = start, ressize = 0; i < end; ++i) {
611 /* object is guaranteed to be "ready" */
612 ch = PyUnicode_READ_CHAR(object, i);
613 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000614 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100615 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100617 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000618 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100619 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000620 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100621 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000622 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100623 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 ressize += 2+6+1;
625 else
626 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 }
628 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100629 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 if (res == NULL) {
631 Py_DECREF(object);
632 return NULL;
633 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100634 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100636 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 int digits;
638 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100639 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640 *outp++ = '&';
641 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100642 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643 digits = 1;
644 base = 1;
645 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100646 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 digits = 2;
648 base = 10;
649 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100650 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 digits = 3;
652 base = 100;
653 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100654 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 digits = 4;
656 base = 1000;
657 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100658 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 digits = 5;
660 base = 10000;
661 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100662 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663 digits = 6;
664 base = 100000;
665 }
666 else {
667 digits = 7;
668 base = 1000000;
669 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100671 *outp++ = '0' + ch/base;
672 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 base /= 10;
674 }
675 *outp++ = ';';
676 }
Victor Stinner8f825062012-04-27 13:55:39 +0200677 assert(_PyUnicode_CheckConsistency(res, 1));
678 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 Py_DECREF(object);
680 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 }
682 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 wrong_exception_type(exc);
684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685 }
686}
687
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
689{
690 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 PyObject *restuple;
692 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100693 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 Py_ssize_t start;
695 Py_ssize_t end;
696 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100697 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100699 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000700 if (PyUnicodeEncodeError_GetStart(exc, &start))
701 return NULL;
702 if (PyUnicodeEncodeError_GetEnd(exc, &end))
703 return NULL;
704 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
705 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100706 for (i = start, ressize = 0; i < end; ++i) {
707 /* object is guaranteed to be "ready" */
708 c = PyUnicode_READ_CHAR(object, i);
709 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100711 }
712 else if (c >= 0x100) {
713 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 }
715 else
716 ressize += 1+1+2;
717 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100718 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (res==NULL)
720 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100721 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
722 i < end; ++i) {
723 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000725 if (c >= 0x00010000) {
726 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200727 *outp++ = Py_hexdigits[(c>>28)&0xf];
728 *outp++ = Py_hexdigits[(c>>24)&0xf];
729 *outp++ = Py_hexdigits[(c>>20)&0xf];
730 *outp++ = Py_hexdigits[(c>>16)&0xf];
731 *outp++ = Py_hexdigits[(c>>12)&0xf];
732 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000734 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200736 *outp++ = Py_hexdigits[(c>>12)&0xf];
737 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 }
739 else
740 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200741 *outp++ = Py_hexdigits[(c>>4)&0xf];
742 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000744
Victor Stinner8f825062012-04-27 13:55:39 +0200745 assert(_PyUnicode_CheckConsistency(res, 1));
746 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 Py_DECREF(object);
748 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749 }
750 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 wrong_exception_type(exc);
752 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000753 }
754}
755
/* Codes returned by get_standard_encoding(). */
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Return nonzero if c is the ASCII letter `lower` in either case.
   `lower` must be a lowercase ASCII letter; no locale is consulted. */
static int
enc_letter_is(char c, char lower)
{
    return c == lower || c == (char)(lower - ('a' - 'A'));
}

/* Classify an encoding name as one of the ENC_* codes.

   Recognized spellings (letters are case-insensitive, each separator may
   be '-' or '_' or be omitted):

       utf-16, utf-16-le, utf-16-be
       utf-32, utf-32-le, utf-32-be

   A name without an explicit byte order maps to the platform's native
   order (selected by WORDS_BIGENDIAN).  Every other name, including
   malformed UTF-16/32 spellings, is reported as ENC_UTF8.

   encoding must be a NUL-terminated string.  *bytelength receives the
   number of bytes one surrogate occupies in the detected encoding
   (2, 4, or 3 for UTF-8).  No byte past the terminating NUL is read. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (enc_letter_is(encoding[0], 'u') &&
        enc_letter_is(encoding[1], 't') &&
        enc_letter_is(encoding[2], 'f')) {
        int width = 0;

        encoding += 3;
        if (*encoding == '-' || *encoding == '_')
            encoding++;
        if (encoding[0] == '1' && encoding[1] == '6')
            width = 2;
        else if (encoding[0] == '3' && encoding[1] == '2')
            width = 4;

        if (width != 0) {
            encoding += 2;
            if (*encoding == '\0') {
                /* No byte order given: use the native one. */
                *bytelength = width;
#ifdef WORDS_BIGENDIAN
                return width == 2 ? ENC_UTF16BE : ENC_UTF32BE;
#else
                return width == 2 ? ENC_UTF16LE : ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_')
                encoding++;
            /* Check encoding[0] first so that a name ending right after
               the separator (e.g. "utf-16-") is never indexed past its
               terminator. */
            if (encoding[0] != '\0' &&
                enc_letter_is(encoding[1], 'e') &&
                encoding[2] == '\0') {
                if (enc_letter_is(encoding[0], 'b')) {
                    *bytelength = width;
                    return width == 2 ? ENC_UTF16BE : ENC_UTF32BE;
                }
                if (enc_letter_is(encoding[0], 'l')) {
                    *bytelength = width;
                    return width == 2 ? ENC_UTF16LE : ENC_UTF32LE;
                }
            }
        }
    }
    /* utf-8 */
    *bytelength = 3;
    return ENC_UTF8;
}
814
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000815/* This handler is declared static until someone demonstrates
816 a need to call it directly. */
817static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000818PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000819{
820 PyObject *restuple;
821 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200822 PyObject *encode;
823 char *encoding;
824 int code;
825 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100826 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000827 Py_ssize_t start;
828 Py_ssize_t end;
829 PyObject *res;
830 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200831 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 if (PyUnicodeEncodeError_GetStart(exc, &start))
833 return NULL;
834 if (PyUnicodeEncodeError_GetEnd(exc, &end))
835 return NULL;
836 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
837 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200838 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
839 Py_DECREF(object);
840 return NULL;
841 }
842 if (!(encoding = PyUnicode_AsUTF8(encode))) {
843 Py_DECREF(object);
844 Py_DECREF(encode);
845 return NULL;
846 }
847 code = get_standard_encoding(encoding, &bytelength);
848 Py_DECREF(encode);
849
850 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000851 if (!res) {
852 Py_DECREF(object);
853 return NULL;
854 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200855 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100856 for (i = start; i < end; i++) {
857 /* object is guaranteed to be "ready" */
858 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +0100859 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000860 /* Not a surrogate, fail with original exception */
861 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
862 Py_DECREF(res);
863 Py_DECREF(object);
864 return NULL;
865 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200866 switch (code) {
867 case ENC_UTF8:
868 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
869 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
870 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
871 break;
872 case ENC_UTF16LE:
873 *outp++ = (unsigned char) ch;
874 *outp++ = (unsigned char)(ch >> 8);
875 break;
876 case ENC_UTF16BE:
877 *outp++ = (unsigned char)(ch >> 8);
878 *outp++ = (unsigned char) ch;
879 break;
880 case ENC_UTF32LE:
881 *outp++ = (unsigned char) ch;
882 *outp++ = (unsigned char)(ch >> 8);
883 *outp++ = (unsigned char)(ch >> 16);
884 *outp++ = (unsigned char)(ch >> 24);
885 break;
886 case ENC_UTF32BE:
887 *outp++ = (unsigned char)(ch >> 24);
888 *outp++ = (unsigned char)(ch >> 16);
889 *outp++ = (unsigned char)(ch >> 8);
890 *outp++ = (unsigned char) ch;
891 break;
892 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000893 }
894 restuple = Py_BuildValue("(On)", res, end);
895 Py_DECREF(res);
896 Py_DECREF(object);
897 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000898 }
899 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100901 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000902 if (PyUnicodeDecodeError_GetStart(exc, &start))
903 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200904 if (PyUnicodeDecodeError_GetEnd(exc, &end))
905 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
907 return NULL;
908 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
909 Py_DECREF(object);
910 return NULL;
911 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200912 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
913 Py_DECREF(object);
914 return NULL;
915 }
916 if (!(encoding = PyUnicode_AsUTF8(encode))) {
917 Py_DECREF(object);
918 Py_DECREF(encode);
919 return NULL;
920 }
921 code = get_standard_encoding(encoding, &bytelength);
922 Py_DECREF(encode);
923
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924 /* Try decoding a single surrogate character. If
925 there are more, let the codec call us again. */
926 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200927 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
928 switch (code) {
929 case ENC_UTF8:
930 if ((p[0] & 0xf0) == 0xe0 &&
931 (p[1] & 0xc0) == 0x80 &&
932 (p[2] & 0xc0) == 0x80) {
933 /* it's a three-byte code */
934 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
935 }
936 break;
937 case ENC_UTF16LE:
938 ch = p[1] << 8 | p[0];
939 break;
940 case ENC_UTF16BE:
941 ch = p[0] << 8 | p[1];
942 break;
943 case ENC_UTF32LE:
944 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
945 break;
946 case ENC_UTF32BE:
947 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
948 break;
949 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000950 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200951
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000952 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200953 if (!Py_UNICODE_IS_SURROGATE(ch)) {
954 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000955 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
956 return NULL;
957 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100958 res = PyUnicode_FromOrdinal(ch);
959 if (res == NULL)
960 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200961 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000962 }
963 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000964 wrong_exception_type(exc);
965 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000966 }
967}
968
Martin v. Löwis011e8422009-05-05 04:43:17 +0000969static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000970PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000971{
972 PyObject *restuple;
973 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100974 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000975 Py_ssize_t start;
976 Py_ssize_t end;
977 PyObject *res;
978 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000979 char *outp;
980 if (PyUnicodeEncodeError_GetStart(exc, &start))
981 return NULL;
982 if (PyUnicodeEncodeError_GetEnd(exc, &end))
983 return NULL;
984 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
985 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000986 res = PyBytes_FromStringAndSize(NULL, end-start);
987 if (!res) {
988 Py_DECREF(object);
989 return NULL;
990 }
991 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100992 for (i = start; i < end; i++) {
993 /* object is guaranteed to be "ready" */
994 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000995 if (ch < 0xdc80 || ch > 0xdcff) {
996 /* Not a UTF-8b surrogate, fail with original exception */
997 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
998 Py_DECREF(res);
999 Py_DECREF(object);
1000 return NULL;
1001 }
1002 *outp++ = ch - 0xdc00;
1003 }
1004 restuple = Py_BuildValue("(On)", res, end);
1005 Py_DECREF(res);
1006 Py_DECREF(object);
1007 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001008 }
1009 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001010 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001011 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001012 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001013 int consumed = 0;
1014 if (PyUnicodeDecodeError_GetStart(exc, &start))
1015 return NULL;
1016 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1017 return NULL;
1018 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1019 return NULL;
1020 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1021 Py_DECREF(object);
1022 return NULL;
1023 }
1024 while (consumed < 4 && consumed < end-start) {
1025 /* Refuse to escape ASCII bytes. */
1026 if (p[start+consumed] < 128)
1027 break;
1028 ch[consumed] = 0xdc00 + p[start+consumed];
1029 consumed++;
1030 }
1031 Py_DECREF(object);
1032 if (!consumed) {
1033 /* codec complained about ASCII byte. */
1034 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1035 return NULL;
1036 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001037 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1038 if (str == NULL)
1039 return NULL;
1040 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001041 }
1042 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001043 wrong_exception_type(exc);
1044 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001045 }
1046}
1047
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001048
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001049static PyObject *strict_errors(PyObject *self, PyObject *exc)
1050{
1051 return PyCodec_StrictErrors(exc);
1052}
1053
1054
1055static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1056{
1057 return PyCodec_IgnoreErrors(exc);
1058}
1059
1060
1061static PyObject *replace_errors(PyObject *self, PyObject *exc)
1062{
1063 return PyCodec_ReplaceErrors(exc);
1064}
1065
1066
1067static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1068{
1069 return PyCodec_XMLCharRefReplaceErrors(exc);
1070}
1071
1072
1073static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1074{
1075 return PyCodec_BackslashReplaceErrors(exc);
1076}
1077
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001078static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001079{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001080 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001081}
1082
Martin v. Löwis43c57782009-05-10 08:15:24 +00001083static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001084{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001085 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001086}
1087
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001088static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001089{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001090 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 char *name;
1092 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 } methods[] =
1094 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 {
1096 "strict",
1097 {
1098 "strict_errors",
1099 strict_errors,
1100 METH_O,
1101 PyDoc_STR("Implements the 'strict' error handling, which "
1102 "raises a UnicodeError on coding errors.")
1103 }
1104 },
1105 {
1106 "ignore",
1107 {
1108 "ignore_errors",
1109 ignore_errors,
1110 METH_O,
1111 PyDoc_STR("Implements the 'ignore' error handling, which "
1112 "ignores malformed data and continues.")
1113 }
1114 },
1115 {
1116 "replace",
1117 {
1118 "replace_errors",
1119 replace_errors,
1120 METH_O,
1121 PyDoc_STR("Implements the 'replace' error handling, which "
1122 "replaces malformed data with a replacement marker.")
1123 }
1124 },
1125 {
1126 "xmlcharrefreplace",
1127 {
1128 "xmlcharrefreplace_errors",
1129 xmlcharrefreplace_errors,
1130 METH_O,
1131 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1132 "which replaces an unencodable character with the "
1133 "appropriate XML character reference.")
1134 }
1135 },
1136 {
1137 "backslashreplace",
1138 {
1139 "backslashreplace_errors",
1140 backslashreplace_errors,
1141 METH_O,
1142 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1143 "which replaces an unencodable character with a "
1144 "backslashed escape sequence.")
1145 }
1146 },
1147 {
1148 "surrogatepass",
1149 {
1150 "surrogatepass",
1151 surrogatepass_errors,
1152 METH_O
1153 }
1154 },
1155 {
1156 "surrogateescape",
1157 {
1158 "surrogateescape",
1159 surrogateescape_errors,
1160 METH_O
1161 }
1162 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001164
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001165 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001166 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001167 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001168
1169 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001170 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001171
1172 interp->codec_search_path = PyList_New(0);
1173 interp->codec_search_cache = PyDict_New();
1174 interp->codec_error_registry = PyDict_New();
1175
1176 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001177 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001178 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001179 int res;
1180 if (!func)
1181 Py_FatalError("can't initialize codec error registry");
1182 res = PyCodec_RegisterError(methods[i].name, func);
1183 Py_DECREF(func);
1184 if (res)
1185 Py_FatalError("can't initialize codec error registry");
1186 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001187 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001188
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001189 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190 interp->codec_search_cache == NULL ||
1191 interp->codec_error_registry == NULL)
1192 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001193
Christian Heimes819b8bf2008-01-03 23:05:47 +00001194 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001195 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001196 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001197 }
1198 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001199 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001200 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001201}