blob: 5ff41b57df2957f0e128a950f2429cb3729be32a [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* Lowercase hexadecimal digit characters: index i holds the digit for
   the value i (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
18/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000019 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000020
21 This is done in a lazy way so that the Unicode implementation does
22 not downgrade startup time of scripts not needing it.
23
Guido van Rossumb95de4f2000-03-31 17:25:23 +000024 ImportErrors are silently ignored by this function. Only one try is
25 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000026
27*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020068 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020070 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
246/* Helper function to create an incremental codec. */
247
248static
249PyObject *codec_getincrementalcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000250 const char *errors,
251 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252{
253 PyObject *codecs, *ret, *inccodec;
254
255 codecs = _PyCodec_Lookup(encoding);
256 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000257 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 inccodec = PyObject_GetAttrString(codecs, attrname);
259 Py_DECREF(codecs);
260 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 Py_DECREF(inccodec);
267 return ret;
268}
269
270/* Helper function to create a stream codec. */
271
272static
273PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000274 PyObject *stream,
275 const char *errors,
276 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000278 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
280 codecs = _PyCodec_Lookup(encoding);
281 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000284 codeccls = PyTuple_GET_ITEM(codecs, index);
285 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000287 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(codecs);
290 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000291}
292
Guido van Rossum98297ee2007-11-06 21:34:58 +0000293/* Convenience APIs to query the Codec registry.
294
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000295 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000296
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000297 */
298
299PyObject *PyCodec_Encoder(const char *encoding)
300{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000302}
303
304PyObject *PyCodec_Decoder(const char *encoding)
305{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000307}
308
Thomas Woutersa9773292006-04-21 09:43:23 +0000309PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000311{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000312 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000313}
314
315PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000317{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000318 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000319}
320
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000321PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 PyObject *stream,
323 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000326}
327
328PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000329 PyObject *stream,
330 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000332 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000333}
334
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of
 * the exception raised.
 *
 * operation is the word used at the start of the message (the callers
 * in this file pass "encoding" or "decoding"); encoding is the codec
 * name.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a
     * suitably updated clone when it can; otherwise the original
     * exception is left alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351/* Encode an object (e.g. an Unicode object) using the given encoding
352 and return the resulting encoded object (usually a Python string).
353
354 errors is passed to the encoder factory as argument if non-NULL. */
355
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000356static PyObject *
357_PyCodec_EncodeInternal(PyObject *object,
358 PyObject *encoder,
359 const char *encoding,
360 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000362 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000363 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000364
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000365 args = args_tuple(object, errors);
366 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000367 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000368
369 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000370 if (result == NULL) {
371 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000373 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000374
Guido van Rossum98297ee2007-11-06 21:34:58 +0000375 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000376 PyTuple_GET_SIZE(result) != 2) {
377 PyErr_SetString(PyExc_TypeError,
378 "encoder must return a tuple (object, integer)");
379 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000381 v = PyTuple_GET_ITEM(result,0);
382 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383 /* We don't check or use the second (integer) entry. */
384
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000385 Py_DECREF(args);
386 Py_DECREF(encoder);
387 Py_DECREF(result);
388 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000389
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000390 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000391 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000392 Py_XDECREF(args);
393 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000394 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000395}
396
397/* Decode an object (usually a Python string) using the given encoding
398 and return an equivalent object (e.g. an Unicode object).
399
400 errors is passed to the decoder factory as argument if non-NULL. */
401
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000402static PyObject *
403_PyCodec_DecodeInternal(PyObject *object,
404 PyObject *decoder,
405 const char *encoding,
406 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000407{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000408 PyObject *args = NULL, *result = NULL;
409 PyObject *v;
410
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000411 args = args_tuple(object, errors);
412 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000413 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000414
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000416 if (result == NULL) {
417 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000419 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000420 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 PyTuple_GET_SIZE(result) != 2) {
422 PyErr_SetString(PyExc_TypeError,
423 "decoder must return a tuple (object,integer)");
424 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000425 }
426 v = PyTuple_GET_ITEM(result,0);
427 Py_INCREF(v);
428 /* We don't check or use the second (integer) entry. */
429
430 Py_DECREF(args);
431 Py_DECREF(decoder);
432 Py_DECREF(result);
433 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000434
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000435 onError:
436 Py_XDECREF(args);
437 Py_XDECREF(decoder);
438 Py_XDECREF(result);
439 return NULL;
440}
441
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000442/* Generic encoding/decoding API */
443PyObject *PyCodec_Encode(PyObject *object,
444 const char *encoding,
445 const char *errors)
446{
447 PyObject *encoder;
448
449 encoder = PyCodec_Encoder(encoding);
450 if (encoder == NULL)
451 return NULL;
452
453 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
454}
455
456PyObject *PyCodec_Decode(PyObject *object,
457 const char *encoding,
458 const char *errors)
459{
460 PyObject *decoder;
461
462 decoder = PyCodec_Decoder(encoding);
463 if (decoder == NULL)
464 return NULL;
465
466 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
467}
468
469/* Text encoding/decoding API */
470static
471PyObject *codec_getitem_checked(const char *encoding,
472 const char *operation_name,
473 int index)
474{
475 _Py_IDENTIFIER(_is_text_encoding);
476 PyObject *codec;
477 PyObject *attr;
478 PyObject *v;
479 int is_text_codec;
480
481 codec = _PyCodec_Lookup(encoding);
482 if (codec == NULL)
483 return NULL;
484
485 /* Backwards compatibility: assume any raw tuple describes a text
486 * encoding, and the same for anything lacking the private
487 * attribute.
488 */
489 if (!PyTuple_CheckExact(codec)) {
490 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
491 if (attr == NULL) {
492 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
493 PyErr_Clear();
494 } else {
495 Py_DECREF(codec);
496 return NULL;
497 }
498 } else {
499 is_text_codec = PyObject_IsTrue(attr);
500 Py_DECREF(attr);
501 if (!is_text_codec) {
502 Py_DECREF(codec);
503 PyErr_Format(PyExc_LookupError,
504 "'%.400s' is not a text encoding; "
505 "use codecs.%s() to handle arbitrary codecs",
506 encoding, operation_name);
507 return NULL;
508 }
509 }
510 }
511
512 v = PyTuple_GET_ITEM(codec, index);
513 Py_DECREF(codec);
514 Py_INCREF(v);
515 return v;
516}
517
518static PyObject * _PyCodec_TextEncoder(const char *encoding)
519{
520 return codec_getitem_checked(encoding, "encode", 0);
521}
522
523static PyObject * _PyCodec_TextDecoder(const char *encoding)
524{
525 return codec_getitem_checked(encoding, "decode", 1);
526}
527
528PyObject *_PyCodec_EncodeText(PyObject *object,
529 const char *encoding,
530 const char *errors)
531{
532 PyObject *encoder;
533
534 encoder = _PyCodec_TextEncoder(encoding);
535 if (encoder == NULL)
536 return NULL;
537
538 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
539}
540
541PyObject *_PyCodec_DecodeText(PyObject *object,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *decoder;
546
547 decoder = _PyCodec_TextDecoder(encoding);
548 if (decoder == NULL)
549 return NULL;
550
551 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
552}
553
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000554/* Register the error handling callback function error under the name
555 name. This function will be called by the codec when it encounters
556 an unencodable characters/undecodable bytes and doesn't know the
557 callback name, when name is specified as the error parameter
558 in the call to the encode/decode function.
559 Return 0 on success, -1 on error */
560int PyCodec_RegisterError(const char *name, PyObject *error)
561{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000562 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000563 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000564 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000565 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000566 PyErr_SetString(PyExc_TypeError, "handler must be callable");
567 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000568 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000569 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300570 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000571}
572
573/* Lookup the error handling callback function registered under the
574 name error. As a special case NULL can be passed, in which case
575 the error handling callback for strict encoding will be returned. */
576PyObject *PyCodec_LookupError(const char *name)
577{
578 PyObject *handler = NULL;
579
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000580 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000581 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000582 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000583
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000584 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000585 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300586 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000587 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000589 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000590 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000591 return handler;
592}
593
594static void wrong_exception_type(PyObject *exc)
595{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200596 _Py_IDENTIFIER(__class__);
597 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200598 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000599 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200600 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000601 Py_DECREF(type);
602 if (name != NULL) {
603 PyErr_Format(PyExc_TypeError,
604 "don't know how to handle %S in error callback", name);
605 Py_DECREF(name);
606 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000607 }
608}
609
610PyObject *PyCodec_StrictErrors(PyObject *exc)
611{
Brett Cannonbf364092006-03-01 04:25:17 +0000612 if (PyExceptionInstance_Check(exc))
613 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000614 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000615 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000616 return NULL;
617}
618
619
620PyObject *PyCodec_IgnoreErrors(PyObject *exc)
621{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000622 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000623 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 if (PyUnicodeEncodeError_GetEnd(exc, &end))
625 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 }
627 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000628 if (PyUnicodeDecodeError_GetEnd(exc, &end))
629 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000630 }
631 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000632 if (PyUnicodeTranslateError_GetEnd(exc, &end))
633 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000634 }
635 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 wrong_exception_type(exc);
637 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638 }
Victor Stinneree450092011-12-01 02:52:11 +0100639 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000640}
641
642
643PyObject *PyCodec_ReplaceErrors(PyObject *exc)
644{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000646
647 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649 int kind;
650 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 if (PyUnicodeEncodeError_GetStart(exc, &start))
652 return NULL;
653 if (PyUnicodeEncodeError_GetEnd(exc, &end))
654 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200655 len = end - start;
656 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 if (res == NULL)
658 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200659 kind = PyUnicode_KIND(res);
660 data = PyUnicode_DATA(res);
661 for (i = 0; i < len; ++i)
662 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200663 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200664 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000665 }
666 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000667 if (PyUnicodeDecodeError_GetEnd(exc, &end))
668 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200669 return Py_BuildValue("(Cn)",
670 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
671 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000672 }
673 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675 int kind;
676 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000677 if (PyUnicodeTranslateError_GetStart(exc, &start))
678 return NULL;
679 if (PyUnicodeTranslateError_GetEnd(exc, &end))
680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200681 len = end - start;
682 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 if (res == NULL)
684 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200685 kind = PyUnicode_KIND(res);
686 data = PyUnicode_DATA(res);
687 for (i=0; i < len; i++)
688 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200689 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200690 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000691 }
692 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000693 wrong_exception_type(exc);
694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695 }
696}
697
698PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
699{
700 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000701 PyObject *restuple;
702 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100703 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 Py_ssize_t start;
705 Py_ssize_t end;
706 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100707 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000708 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100709 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 if (PyUnicodeEncodeError_GetStart(exc, &start))
711 return NULL;
712 if (PyUnicodeEncodeError_GetEnd(exc, &end))
713 return NULL;
714 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
715 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100716 for (i = start, ressize = 0; i < end; ++i) {
717 /* object is guaranteed to be "ready" */
718 ch = PyUnicode_READ_CHAR(object, i);
719 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100721 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000722 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100723 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100725 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000726 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100727 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100729 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000730 ressize += 2+6+1;
731 else
732 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 }
734 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100735 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 if (res == NULL) {
737 Py_DECREF(object);
738 return NULL;
739 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100740 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000741 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100742 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 int digits;
744 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100745 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 *outp++ = '&';
747 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100748 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000749 digits = 1;
750 base = 1;
751 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100752 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000753 digits = 2;
754 base = 10;
755 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100756 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 digits = 3;
758 base = 100;
759 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100760 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000761 digits = 4;
762 base = 1000;
763 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100764 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 digits = 5;
766 base = 10000;
767 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100768 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 digits = 6;
770 base = 100000;
771 }
772 else {
773 digits = 7;
774 base = 1000000;
775 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100777 *outp++ = '0' + ch/base;
778 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 base /= 10;
780 }
781 *outp++ = ';';
782 }
Victor Stinner8f825062012-04-27 13:55:39 +0200783 assert(_PyUnicode_CheckConsistency(res, 1));
784 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 Py_DECREF(object);
786 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000787 }
788 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 wrong_exception_type(exc);
790 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000791 }
792}
793
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000794PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
795{
796 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 PyObject *restuple;
798 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100799 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 Py_ssize_t start;
801 Py_ssize_t end;
802 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100803 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100805 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 if (PyUnicodeEncodeError_GetStart(exc, &start))
807 return NULL;
808 if (PyUnicodeEncodeError_GetEnd(exc, &end))
809 return NULL;
810 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
811 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100812 for (i = start, ressize = 0; i < end; ++i) {
813 /* object is guaranteed to be "ready" */
814 c = PyUnicode_READ_CHAR(object, i);
815 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100817 }
818 else if (c >= 0x100) {
819 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 }
821 else
822 ressize += 1+1+2;
823 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100824 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000825 if (res==NULL)
826 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100827 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
828 i < end; ++i) {
829 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 if (c >= 0x00010000) {
832 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200833 *outp++ = Py_hexdigits[(c>>28)&0xf];
834 *outp++ = Py_hexdigits[(c>>24)&0xf];
835 *outp++ = Py_hexdigits[(c>>20)&0xf];
836 *outp++ = Py_hexdigits[(c>>16)&0xf];
837 *outp++ = Py_hexdigits[(c>>12)&0xf];
838 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000840 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200842 *outp++ = Py_hexdigits[(c>>12)&0xf];
843 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844 }
845 else
846 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200847 *outp++ = Py_hexdigits[(c>>4)&0xf];
848 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000850
Victor Stinner8f825062012-04-27 13:55:39 +0200851 assert(_PyUnicode_CheckConsistency(res, 1));
852 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 Py_DECREF(object);
854 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000855 }
856 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 wrong_exception_type(exc);
858 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000859 }
860}
861
/* Codes returned by get_standard_encoding(). */
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify the NUL-terminated encoding name `encoding`.

   Recognised spellings are "utf" (any case), an optional '-' or '_',
   "16" or "32", and then either nothing (native byte order, selected by
   WORDS_BIGENDIAN) or an optional '-' or '_' followed by "be"/"le" (any
   case).  Every other name, including the utf-8 spellings, is reported as
   ENC_UTF8.

   Returns one of the ENC_* codes and stores in *bytelength the number of
   bytes the caller should use per code unit: 2 for UTF-16, 4 for UTF-32
   and 3 for the ENC_UTF8 fallback. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: after skipping a trailing separator
               it may be the terminator, and encoding[1] would then lie
               outside the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* same terminator guard as for UTF-16 above */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    /* utf-8 (also the fallback for anything not matched above) */
    *bytelength = 3;
    return ENC_UTF8;
}
920
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000921/* This handler is declared static until someone demonstrates
922 a need to call it directly. */
923static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000924PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000925{
926 PyObject *restuple;
927 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200928 PyObject *encode;
929 char *encoding;
930 int code;
931 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100932 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000933 Py_ssize_t start;
934 Py_ssize_t end;
935 PyObject *res;
936 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200937 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000938 if (PyUnicodeEncodeError_GetStart(exc, &start))
939 return NULL;
940 if (PyUnicodeEncodeError_GetEnd(exc, &end))
941 return NULL;
942 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
943 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200944 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
945 Py_DECREF(object);
946 return NULL;
947 }
948 if (!(encoding = PyUnicode_AsUTF8(encode))) {
949 Py_DECREF(object);
950 Py_DECREF(encode);
951 return NULL;
952 }
953 code = get_standard_encoding(encoding, &bytelength);
954 Py_DECREF(encode);
955
956 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000957 if (!res) {
958 Py_DECREF(object);
959 return NULL;
960 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200961 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100962 for (i = start; i < end; i++) {
963 /* object is guaranteed to be "ready" */
964 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +0100965 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000966 /* Not a surrogate, fail with original exception */
967 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
968 Py_DECREF(res);
969 Py_DECREF(object);
970 return NULL;
971 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200972 switch (code) {
973 case ENC_UTF8:
974 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
975 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
976 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
977 break;
978 case ENC_UTF16LE:
979 *outp++ = (unsigned char) ch;
980 *outp++ = (unsigned char)(ch >> 8);
981 break;
982 case ENC_UTF16BE:
983 *outp++ = (unsigned char)(ch >> 8);
984 *outp++ = (unsigned char) ch;
985 break;
986 case ENC_UTF32LE:
987 *outp++ = (unsigned char) ch;
988 *outp++ = (unsigned char)(ch >> 8);
989 *outp++ = (unsigned char)(ch >> 16);
990 *outp++ = (unsigned char)(ch >> 24);
991 break;
992 case ENC_UTF32BE:
993 *outp++ = (unsigned char)(ch >> 24);
994 *outp++ = (unsigned char)(ch >> 16);
995 *outp++ = (unsigned char)(ch >> 8);
996 *outp++ = (unsigned char) ch;
997 break;
998 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000999 }
1000 restuple = Py_BuildValue("(On)", res, end);
1001 Py_DECREF(res);
1002 Py_DECREF(object);
1003 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001004 }
1005 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001006 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001007 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001008 if (PyUnicodeDecodeError_GetStart(exc, &start))
1009 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001010 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1011 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001012 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1013 return NULL;
1014 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1015 Py_DECREF(object);
1016 return NULL;
1017 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001018 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1019 Py_DECREF(object);
1020 return NULL;
1021 }
1022 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1023 Py_DECREF(object);
1024 Py_DECREF(encode);
1025 return NULL;
1026 }
1027 code = get_standard_encoding(encoding, &bytelength);
1028 Py_DECREF(encode);
1029
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001030 /* Try decoding a single surrogate character. If
1031 there are more, let the codec call us again. */
1032 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001033 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1034 switch (code) {
1035 case ENC_UTF8:
1036 if ((p[0] & 0xf0) == 0xe0 &&
1037 (p[1] & 0xc0) == 0x80 &&
1038 (p[2] & 0xc0) == 0x80) {
1039 /* it's a three-byte code */
1040 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1041 }
1042 break;
1043 case ENC_UTF16LE:
1044 ch = p[1] << 8 | p[0];
1045 break;
1046 case ENC_UTF16BE:
1047 ch = p[0] << 8 | p[1];
1048 break;
1049 case ENC_UTF32LE:
1050 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1051 break;
1052 case ENC_UTF32BE:
1053 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1054 break;
1055 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001056 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001057
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001058 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001059 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1060 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1062 return NULL;
1063 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001064 res = PyUnicode_FromOrdinal(ch);
1065 if (res == NULL)
1066 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001067 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001068 }
1069 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001070 wrong_exception_type(exc);
1071 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001072 }
1073}
1074
Martin v. Löwis011e8422009-05-05 04:43:17 +00001075static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001076PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001077{
1078 PyObject *restuple;
1079 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001080 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001081 Py_ssize_t start;
1082 Py_ssize_t end;
1083 PyObject *res;
1084 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001085 char *outp;
1086 if (PyUnicodeEncodeError_GetStart(exc, &start))
1087 return NULL;
1088 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1089 return NULL;
1090 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1091 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001092 res = PyBytes_FromStringAndSize(NULL, end-start);
1093 if (!res) {
1094 Py_DECREF(object);
1095 return NULL;
1096 }
1097 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001098 for (i = start; i < end; i++) {
1099 /* object is guaranteed to be "ready" */
1100 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 if (ch < 0xdc80 || ch > 0xdcff) {
1102 /* Not a UTF-8b surrogate, fail with original exception */
1103 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1104 Py_DECREF(res);
1105 Py_DECREF(object);
1106 return NULL;
1107 }
1108 *outp++ = ch - 0xdc00;
1109 }
1110 restuple = Py_BuildValue("(On)", res, end);
1111 Py_DECREF(res);
1112 Py_DECREF(object);
1113 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001114 }
1115 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001116 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001118 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 int consumed = 0;
1120 if (PyUnicodeDecodeError_GetStart(exc, &start))
1121 return NULL;
1122 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1123 return NULL;
1124 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1125 return NULL;
1126 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1127 Py_DECREF(object);
1128 return NULL;
1129 }
1130 while (consumed < 4 && consumed < end-start) {
1131 /* Refuse to escape ASCII bytes. */
1132 if (p[start+consumed] < 128)
1133 break;
1134 ch[consumed] = 0xdc00 + p[start+consumed];
1135 consumed++;
1136 }
1137 Py_DECREF(object);
1138 if (!consumed) {
1139 /* codec complained about ASCII byte. */
1140 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1141 return NULL;
1142 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001143 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1144 if (str == NULL)
1145 return NULL;
1146 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001147 }
1148 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001149 wrong_exception_type(exc);
1150 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001151 }
1152}
1153
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001154
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001155static PyObject *strict_errors(PyObject *self, PyObject *exc)
1156{
1157 return PyCodec_StrictErrors(exc);
1158}
1159
1160
1161static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1162{
1163 return PyCodec_IgnoreErrors(exc);
1164}
1165
1166
1167static PyObject *replace_errors(PyObject *self, PyObject *exc)
1168{
1169 return PyCodec_ReplaceErrors(exc);
1170}
1171
1172
1173static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1174{
1175 return PyCodec_XMLCharRefReplaceErrors(exc);
1176}
1177
1178
1179static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1180{
1181 return PyCodec_BackslashReplaceErrors(exc);
1182}
1183
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001184static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001185{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001186 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001187}
1188
Martin v. Löwis43c57782009-05-10 08:15:24 +00001189static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001190{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001191 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001192}
1193
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001194static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001196 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 char *name;
1198 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 } methods[] =
1200 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001201 {
1202 "strict",
1203 {
1204 "strict_errors",
1205 strict_errors,
1206 METH_O,
1207 PyDoc_STR("Implements the 'strict' error handling, which "
1208 "raises a UnicodeError on coding errors.")
1209 }
1210 },
1211 {
1212 "ignore",
1213 {
1214 "ignore_errors",
1215 ignore_errors,
1216 METH_O,
1217 PyDoc_STR("Implements the 'ignore' error handling, which "
1218 "ignores malformed data and continues.")
1219 }
1220 },
1221 {
1222 "replace",
1223 {
1224 "replace_errors",
1225 replace_errors,
1226 METH_O,
1227 PyDoc_STR("Implements the 'replace' error handling, which "
1228 "replaces malformed data with a replacement marker.")
1229 }
1230 },
1231 {
1232 "xmlcharrefreplace",
1233 {
1234 "xmlcharrefreplace_errors",
1235 xmlcharrefreplace_errors,
1236 METH_O,
1237 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1238 "which replaces an unencodable character with the "
1239 "appropriate XML character reference.")
1240 }
1241 },
1242 {
1243 "backslashreplace",
1244 {
1245 "backslashreplace_errors",
1246 backslashreplace_errors,
1247 METH_O,
1248 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1249 "which replaces an unencodable character with a "
1250 "backslashed escape sequence.")
1251 }
1252 },
1253 {
1254 "surrogatepass",
1255 {
1256 "surrogatepass",
1257 surrogatepass_errors,
1258 METH_O
1259 }
1260 },
1261 {
1262 "surrogateescape",
1263 {
1264 "surrogateescape",
1265 surrogateescape_errors,
1266 METH_O
1267 }
1268 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001271 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001272 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001273 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001274
1275 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001277
1278 interp->codec_search_path = PyList_New(0);
1279 interp->codec_search_cache = PyDict_New();
1280 interp->codec_error_registry = PyDict_New();
1281
1282 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001283 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001284 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 int res;
1286 if (!func)
1287 Py_FatalError("can't initialize codec error registry");
1288 res = PyCodec_RegisterError(methods[i].name, func);
1289 Py_DECREF(func);
1290 if (res)
1291 Py_FatalError("can't initialize codec error registry");
1292 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001293 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001294
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001295 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 interp->codec_search_cache == NULL ||
1297 interp->codec_error_registry == NULL)
1298 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001299
Christian Heimes819b8bf2008-01-03 23:05:47 +00001300 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001301 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001303 }
1304 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001305 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001306 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001307}