blob: fe0cab4f7fb6e9b4f1023b966a319f06ed8d47b9 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Victor Stinnerf5cff562011-10-14 02:13:11 +020014const char *Py_hexdigits = "0123456789abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
18/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000019 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000020
21 This is done in a lazy way so that the Unicode implementation does
22 not downgrade startup time of scripts not needing it.
23
Guido van Rossumb95de4f2000-03-31 17:25:23 +000024 ImportErrors are silently ignored by this function. Only one try is
25 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000026
27*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020068 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020070 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
246/* Helper function to create an incremental codec. */
247
248static
249PyObject *codec_getincrementalcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 const char *errors,
251 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252{
253 PyObject *codecs, *ret, *inccodec;
254
255 codecs = _PyCodec_Lookup(encoding);
256 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 inccodec = PyObject_GetAttrString(codecs, attrname);
259 Py_DECREF(codecs);
260 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 Py_DECREF(inccodec);
267 return ret;
268}
269
270/* Helper function to create a stream codec. */
271
272static
273PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 PyObject *stream,
275 const char *errors,
276 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000278 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
280 codecs = _PyCodec_Lookup(encoding);
281 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000284 codeccls = PyTuple_GET_ITEM(codecs, index);
285 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000287 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(codecs);
290 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000291}
292
Guido van Rossum98297ee2007-11-06 21:34:58 +0000293/* Convenience APIs to query the Codec registry.
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000296
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000297 */
298
299PyObject *PyCodec_Encoder(const char *encoding)
300{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302}
303
304PyObject *PyCodec_Decoder(const char *encoding)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
Thomas Woutersa9773292006-04-21 09:43:23 +0000309PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000311{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000312 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000313}
314
315PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000317{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000318 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000319}
320
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000321PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 PyObject *stream,
323 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000326}
327
328PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 PyObject *stream,
330 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000333}
334
/* Helper that tries to make the reported exception chain name the codec
 * operation that failed, without changing the type of the exception
 * raised.
 *
 * operation is a word such as "encoding" or "decoding"; encoding is the
 * codec name.  Both are interpolated into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* Per the original note: _PyErr_TrySetFromCause replaces the active
     * exception with a suitably updated clone when it can and leaves the
     * original exception untouched otherwise.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351/* Encode an object (e.g. an Unicode object) using the given encoding
352 and return the resulting encoded object (usually a Python string).
353
354 errors is passed to the encoder factory as argument if non-NULL. */
355
356PyObject *PyCodec_Encode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 const char *encoding,
358 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359{
360 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000361 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000362 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000363
364 encoder = PyCodec_Encoder(encoding);
365 if (encoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000367
368 args = args_tuple(object, errors);
369 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000370 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000371
372 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000373 if (result == NULL) {
374 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000375 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000376 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377
Guido van Rossum98297ee2007-11-06 21:34:58 +0000378 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 PyTuple_GET_SIZE(result) != 2) {
380 PyErr_SetString(PyExc_TypeError,
381 "encoder must return a tuple (object, integer)");
382 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000384 v = PyTuple_GET_ITEM(result,0);
385 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000386 /* We don't check or use the second (integer) entry. */
387
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000388 Py_DECREF(args);
389 Py_DECREF(encoder);
390 Py_DECREF(result);
391 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000393 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000394 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000395 Py_XDECREF(args);
396 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000397 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000398}
399
400/* Decode an object (usually a Python string) using the given encoding
401 and return an equivalent object (e.g. an Unicode object).
402
403 errors is passed to the decoder factory as argument if non-NULL. */
404
405PyObject *PyCodec_Decode(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000406 const char *encoding,
407 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000408{
409 PyObject *decoder = NULL;
410 PyObject *args = NULL, *result = NULL;
411 PyObject *v;
412
413 decoder = PyCodec_Decoder(encoding);
414 if (decoder == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000415 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000416
417 args = args_tuple(object, errors);
418 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000422 if (result == NULL) {
423 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000424 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000425 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000426 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 PyTuple_GET_SIZE(result) != 2) {
428 PyErr_SetString(PyExc_TypeError,
429 "decoder must return a tuple (object,integer)");
430 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000431 }
432 v = PyTuple_GET_ITEM(result,0);
433 Py_INCREF(v);
434 /* We don't check or use the second (integer) entry. */
435
436 Py_DECREF(args);
437 Py_DECREF(decoder);
438 Py_DECREF(result);
439 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000440
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000441 onError:
442 Py_XDECREF(args);
443 Py_XDECREF(decoder);
444 Py_XDECREF(result);
445 return NULL;
446}
447
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000448/* Register the error handling callback function error under the name
449 name. This function will be called by the codec when it encounters
450 an unencodable characters/undecodable bytes and doesn't know the
451 callback name, when name is specified as the error parameter
452 in the call to the encode/decode function.
453 Return 0 on success, -1 on error */
454int PyCodec_RegisterError(const char *name, PyObject *error)
455{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000456 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000457 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000458 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000459 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000460 PyErr_SetString(PyExc_TypeError, "handler must be callable");
461 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000462 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000463 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300464 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000465}
466
467/* Lookup the error handling callback function registered under the
468 name error. As a special case NULL can be passed, in which case
469 the error handling callback for strict encoding will be returned. */
470PyObject *PyCodec_LookupError(const char *name)
471{
472 PyObject *handler = NULL;
473
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000474 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000475 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000477
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000478 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300480 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000481 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000482 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000483 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000484 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000485 return handler;
486}
487
488static void wrong_exception_type(PyObject *exc)
489{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200490 _Py_IDENTIFIER(__class__);
491 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200492 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000493 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200494 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000495 Py_DECREF(type);
496 if (name != NULL) {
497 PyErr_Format(PyExc_TypeError,
498 "don't know how to handle %S in error callback", name);
499 Py_DECREF(name);
500 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000501 }
502}
503
504PyObject *PyCodec_StrictErrors(PyObject *exc)
505{
Brett Cannonbf364092006-03-01 04:25:17 +0000506 if (PyExceptionInstance_Check(exc))
507 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000508 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000509 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000510 return NULL;
511}
512
513
514PyObject *PyCodec_IgnoreErrors(PyObject *exc)
515{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000516 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000517 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000518 if (PyUnicodeEncodeError_GetEnd(exc, &end))
519 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000520 }
521 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000522 if (PyUnicodeDecodeError_GetEnd(exc, &end))
523 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000524 }
525 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000526 if (PyUnicodeTranslateError_GetEnd(exc, &end))
527 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528 }
529 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000530 wrong_exception_type(exc);
531 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000532 }
Victor Stinneree450092011-12-01 02:52:11 +0100533 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000534}
535
536
537PyObject *PyCodec_ReplaceErrors(PyObject *exc)
538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000540
541 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000542 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200543 int kind;
544 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000545 if (PyUnicodeEncodeError_GetStart(exc, &start))
546 return NULL;
547 if (PyUnicodeEncodeError_GetEnd(exc, &end))
548 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200549 len = end - start;
550 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000551 if (res == NULL)
552 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553 kind = PyUnicode_KIND(res);
554 data = PyUnicode_DATA(res);
555 for (i = 0; i < len; ++i)
556 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200557 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200558 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000559 }
560 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000561 if (PyUnicodeDecodeError_GetEnd(exc, &end))
562 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200563 return Py_BuildValue("(Cn)",
564 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
565 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000566 }
567 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000568 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200569 int kind;
570 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000571 if (PyUnicodeTranslateError_GetStart(exc, &start))
572 return NULL;
573 if (PyUnicodeTranslateError_GetEnd(exc, &end))
574 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200575 len = end - start;
576 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000577 if (res == NULL)
578 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200579 kind = PyUnicode_KIND(res);
580 data = PyUnicode_DATA(res);
581 for (i=0; i < len; i++)
582 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200583 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200584 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000585 }
586 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000587 wrong_exception_type(exc);
588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000589 }
590}
591
592PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
593{
594 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000595 PyObject *restuple;
596 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100597 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000598 Py_ssize_t start;
599 Py_ssize_t end;
600 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100601 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000602 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100603 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 if (PyUnicodeEncodeError_GetStart(exc, &start))
605 return NULL;
606 if (PyUnicodeEncodeError_GetEnd(exc, &end))
607 return NULL;
608 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
609 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100610 for (i = start, ressize = 0; i < end; ++i) {
611 /* object is guaranteed to be "ready" */
612 ch = PyUnicode_READ_CHAR(object, i);
613 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000614 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100615 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000616 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100617 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000618 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100619 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000620 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100621 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000622 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100623 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 ressize += 2+6+1;
625 else
626 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 }
628 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100629 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 if (res == NULL) {
631 Py_DECREF(object);
632 return NULL;
633 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100634 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100636 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 int digits;
638 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100639 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000640 *outp++ = '&';
641 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100642 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643 digits = 1;
644 base = 1;
645 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100646 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 digits = 2;
648 base = 10;
649 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100650 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 digits = 3;
652 base = 100;
653 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100654 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 digits = 4;
656 base = 1000;
657 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100658 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 digits = 5;
660 base = 10000;
661 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100662 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000663 digits = 6;
664 base = 100000;
665 }
666 else {
667 digits = 7;
668 base = 1000000;
669 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100671 *outp++ = '0' + ch/base;
672 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 base /= 10;
674 }
675 *outp++ = ';';
676 }
Victor Stinner8f825062012-04-27 13:55:39 +0200677 assert(_PyUnicode_CheckConsistency(res, 1));
678 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 Py_DECREF(object);
680 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 }
682 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 wrong_exception_type(exc);
684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685 }
686}
687
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
689{
690 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 PyObject *restuple;
692 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100693 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 Py_ssize_t start;
695 Py_ssize_t end;
696 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100697 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100699 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000700 if (PyUnicodeEncodeError_GetStart(exc, &start))
701 return NULL;
702 if (PyUnicodeEncodeError_GetEnd(exc, &end))
703 return NULL;
704 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
705 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100706 for (i = start, ressize = 0; i < end; ++i) {
707 /* object is guaranteed to be "ready" */
708 c = PyUnicode_READ_CHAR(object, i);
709 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100711 }
712 else if (c >= 0x100) {
713 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 }
715 else
716 ressize += 1+1+2;
717 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100718 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (res==NULL)
720 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100721 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
722 i < end; ++i) {
723 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000725 if (c >= 0x00010000) {
726 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200727 *outp++ = Py_hexdigits[(c>>28)&0xf];
728 *outp++ = Py_hexdigits[(c>>24)&0xf];
729 *outp++ = Py_hexdigits[(c>>20)&0xf];
730 *outp++ = Py_hexdigits[(c>>16)&0xf];
731 *outp++ = Py_hexdigits[(c>>12)&0xf];
732 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000734 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200736 *outp++ = Py_hexdigits[(c>>12)&0xf];
737 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 }
739 else
740 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200741 *outp++ = Py_hexdigits[(c>>4)&0xf];
742 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000744
Victor Stinner8f825062012-04-27 13:55:39 +0200745 assert(_PyUnicode_CheckConsistency(res, 1));
746 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 Py_DECREF(object);
748 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749 }
750 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 wrong_exception_type(exc);
752 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000753 }
754}
755
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000756/* This handler is declared static until someone demonstrates
757 a need to call it directly. */
758static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000759PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000760{
761 PyObject *restuple;
762 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100763 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000764 Py_ssize_t start;
765 Py_ssize_t end;
766 PyObject *res;
767 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768 char *outp;
769 if (PyUnicodeEncodeError_GetStart(exc, &start))
770 return NULL;
771 if (PyUnicodeEncodeError_GetEnd(exc, &end))
772 return NULL;
773 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
774 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
776 if (!res) {
777 Py_DECREF(object);
778 return NULL;
779 }
780 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100781 for (i = start; i < end; i++) {
782 /* object is guaranteed to be "ready" */
783 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +0100784 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 /* Not a surrogate, fail with original exception */
786 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
787 Py_DECREF(res);
788 Py_DECREF(object);
789 return NULL;
790 }
791 *outp++ = (char)(0xe0 | (ch >> 12));
792 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
793 *outp++ = (char)(0x80 | (ch & 0x3f));
794 }
795 restuple = Py_BuildValue("(On)", res, end);
796 Py_DECREF(res);
797 Py_DECREF(object);
798 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000799 }
800 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100802 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 if (PyUnicodeDecodeError_GetStart(exc, &start))
804 return NULL;
805 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
806 return NULL;
807 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
808 Py_DECREF(object);
809 return NULL;
810 }
811 /* Try decoding a single surrogate character. If
812 there are more, let the codec call us again. */
813 p += start;
Ezio Melotti540da762012-11-03 23:03:39 +0200814 if (PyBytes_GET_SIZE(object) - start >= 3 &&
815 (p[0] & 0xf0) == 0xe0 &&
816 (p[1] & 0xc0) == 0x80 &&
817 (p[2] & 0xc0) == 0x80) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 /* it's a three-byte code */
819 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
Victor Stinner76df43d2012-10-30 01:42:39 +0100820 if (!Py_UNICODE_IS_SURROGATE(ch))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 /* it's not a surrogate - fail */
822 ch = 0;
823 }
824 Py_DECREF(object);
825 if (ch == 0) {
826 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
827 return NULL;
828 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100829 res = PyUnicode_FromOrdinal(ch);
830 if (res == NULL)
831 return NULL;
832 return Py_BuildValue("(Nn)", res, start+3);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000833 }
834 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 wrong_exception_type(exc);
836 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000837 }
838}
839
Martin v. Löwis011e8422009-05-05 04:43:17 +0000840static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000841PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000842{
843 PyObject *restuple;
844 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100845 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000846 Py_ssize_t start;
847 Py_ssize_t end;
848 PyObject *res;
849 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 char *outp;
851 if (PyUnicodeEncodeError_GetStart(exc, &start))
852 return NULL;
853 if (PyUnicodeEncodeError_GetEnd(exc, &end))
854 return NULL;
855 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
856 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 res = PyBytes_FromStringAndSize(NULL, end-start);
858 if (!res) {
859 Py_DECREF(object);
860 return NULL;
861 }
862 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100863 for (i = start; i < end; i++) {
864 /* object is guaranteed to be "ready" */
865 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000866 if (ch < 0xdc80 || ch > 0xdcff) {
867 /* Not a UTF-8b surrogate, fail with original exception */
868 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
869 Py_DECREF(res);
870 Py_DECREF(object);
871 return NULL;
872 }
873 *outp++ = ch - 0xdc00;
874 }
875 restuple = Py_BuildValue("(On)", res, end);
876 Py_DECREF(res);
877 Py_DECREF(object);
878 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000879 }
880 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100881 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000882 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100883 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000884 int consumed = 0;
885 if (PyUnicodeDecodeError_GetStart(exc, &start))
886 return NULL;
887 if (PyUnicodeDecodeError_GetEnd(exc, &end))
888 return NULL;
889 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
890 return NULL;
891 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
892 Py_DECREF(object);
893 return NULL;
894 }
895 while (consumed < 4 && consumed < end-start) {
896 /* Refuse to escape ASCII bytes. */
897 if (p[start+consumed] < 128)
898 break;
899 ch[consumed] = 0xdc00 + p[start+consumed];
900 consumed++;
901 }
902 Py_DECREF(object);
903 if (!consumed) {
904 /* codec complained about ASCII byte. */
905 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
906 return NULL;
907 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100908 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
909 if (str == NULL)
910 return NULL;
911 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000912 }
913 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 wrong_exception_type(exc);
915 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000916 }
917}
918
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000919
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000920static PyObject *strict_errors(PyObject *self, PyObject *exc)
921{
922 return PyCodec_StrictErrors(exc);
923}
924
925
926static PyObject *ignore_errors(PyObject *self, PyObject *exc)
927{
928 return PyCodec_IgnoreErrors(exc);
929}
930
931
932static PyObject *replace_errors(PyObject *self, PyObject *exc)
933{
934 return PyCodec_ReplaceErrors(exc);
935}
936
937
938static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
939{
940 return PyCodec_XMLCharRefReplaceErrors(exc);
941}
942
943
944static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
945{
946 return PyCodec_BackslashReplaceErrors(exc);
947}
948
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000949static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000950{
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000951 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000952}
953
Martin v. Löwis43c57782009-05-10 08:15:24 +0000954static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000955{
Martin v. Löwis43c57782009-05-10 08:15:24 +0000956 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +0000957}
958
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000959static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000960{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000961 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000962 char *name;
963 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000964 } methods[] =
965 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 {
967 "strict",
968 {
969 "strict_errors",
970 strict_errors,
971 METH_O,
972 PyDoc_STR("Implements the 'strict' error handling, which "
973 "raises a UnicodeError on coding errors.")
974 }
975 },
976 {
977 "ignore",
978 {
979 "ignore_errors",
980 ignore_errors,
981 METH_O,
982 PyDoc_STR("Implements the 'ignore' error handling, which "
983 "ignores malformed data and continues.")
984 }
985 },
986 {
987 "replace",
988 {
989 "replace_errors",
990 replace_errors,
991 METH_O,
992 PyDoc_STR("Implements the 'replace' error handling, which "
993 "replaces malformed data with a replacement marker.")
994 }
995 },
996 {
997 "xmlcharrefreplace",
998 {
999 "xmlcharrefreplace_errors",
1000 xmlcharrefreplace_errors,
1001 METH_O,
1002 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1003 "which replaces an unencodable character with the "
1004 "appropriate XML character reference.")
1005 }
1006 },
1007 {
1008 "backslashreplace",
1009 {
1010 "backslashreplace_errors",
1011 backslashreplace_errors,
1012 METH_O,
1013 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1014 "which replaces an unencodable character with a "
1015 "backslashed escape sequence.")
1016 }
1017 },
1018 {
1019 "surrogatepass",
1020 {
1021 "surrogatepass",
1022 surrogatepass_errors,
1023 METH_O
1024 }
1025 },
1026 {
1027 "surrogateescape",
1028 {
1029 "surrogateescape",
1030 surrogateescape_errors,
1031 METH_O
1032 }
1033 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001034 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001035
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001036 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001037 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001038 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001039
1040 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001041 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001042
1043 interp->codec_search_path = PyList_New(0);
1044 interp->codec_search_cache = PyDict_New();
1045 interp->codec_error_registry = PyDict_New();
1046
1047 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001048 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001049 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001050 int res;
1051 if (!func)
1052 Py_FatalError("can't initialize codec error registry");
1053 res = PyCodec_RegisterError(methods[i].name, func);
1054 Py_DECREF(func);
1055 if (res)
1056 Py_FatalError("can't initialize codec error registry");
1057 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001058 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001059
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001060 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 interp->codec_search_cache == NULL ||
1062 interp->codec_error_registry == NULL)
1063 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001064
Christian Heimes819b8bf2008-01-03 23:05:47 +00001065 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001066 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001067 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001068 }
1069 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001070 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001071 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001072}