blob: ea33c49f2090b511c66055c65ceea6e086b62e66 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Victor Stinnerf5cff562011-10-14 02:13:11 +020014const char *Py_hexdigits = "0123456789abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
18/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000019 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000020
21 This is done in a lazy way so that the Unicode implementation does
22 not downgrade startup time of scripts not needing it.
23
Guido van Rossumb95de4f2000-03-31 17:25:23 +000024 ImportErrors are silently ignored by this function. Only one try is
25 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000026
27*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Guido van Rossum33831132000-06-29 14:50:15 +000056 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
68 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
70 register char ch = string[i];
71 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
Georg Brandl2fc8f772014-03-02 09:18:31 +0100246/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247static
Georg Brandl2fc8f772014-03-02 09:18:31 +0100248PyObject *codec_makeincrementalcodec(PyObject *codec_info,
249 const char *errors,
250 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251{
Georg Brandl2fc8f772014-03-02 09:18:31 +0100252 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253
Georg Brandl2fc8f772014-03-02 09:18:31 +0100254 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000257 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 Py_DECREF(inccodec);
262 return ret;
263}
264
Georg Brandl2fc8f772014-03-02 09:18:31 +0100265static
266PyObject *codec_getincrementalcodec(const char *encoding,
267 const char *errors,
268 const char *attrname)
269{
270 PyObject *codec_info, *ret;
271
272 codec_info = _PyCodec_Lookup(encoding);
273 if (codec_info == NULL)
274 return NULL;
275 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
276 Py_DECREF(codec_info);
277 return ret;
278}
279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280/* Helper function to create a stream codec. */
281
282static
283PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 PyObject *stream,
285 const char *errors,
286 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000288 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289
290 codecs = _PyCodec_Lookup(encoding);
291 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000294 codeccls = PyTuple_GET_ITEM(codecs, index);
295 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000296 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000297 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 Py_DECREF(codecs);
300 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000301}
302
Georg Brandl2fc8f772014-03-02 09:18:31 +0100303/* Helpers to work with the result of _PyCodec_Lookup
304
305 */
306PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
307 const char *errors)
308{
309 return codec_makeincrementalcodec(codec_info, errors,
310 "incrementaldecoder");
311}
312
313PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
314 const char *errors)
315{
316 return codec_makeincrementalcodec(codec_info, errors,
317 "incrementalencoder");
318}
319
320
Guido van Rossum98297ee2007-11-06 21:34:58 +0000321/* Convenience APIs to query the Codec registry.
322
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000323 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000324
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000325 */
326
327PyObject *PyCodec_Encoder(const char *encoding)
328{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000329 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000330}
331
332PyObject *PyCodec_Decoder(const char *encoding)
333{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000334 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000335}
336
Thomas Woutersa9773292006-04-21 09:43:23 +0000337PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000339{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000341}
342
343PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000345{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000346 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000347}
348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 PyObject *stream,
351 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000354}
355
356PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 PyObject *stream,
358 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361}
362
363/* Encode an object (e.g. an Unicode object) using the given encoding
364 and return the resulting encoded object (usually a Python string).
365
366 errors is passed to the encoder factory as argument if non-NULL. */
367
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200368static PyObject *
369_PyCodec_EncodeInternal(PyObject *object,
370 PyObject *encoder,
371 const char *encoding,
372 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000374 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000375 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377 args = args_tuple(object, errors);
378 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000380
381 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000384
Guido van Rossum98297ee2007-11-06 21:34:58 +0000385 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000386 PyTuple_GET_SIZE(result) != 2) {
387 PyErr_SetString(PyExc_TypeError,
388 "encoder must return a tuple (object, integer)");
389 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000390 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000391 v = PyTuple_GET_ITEM(result,0);
392 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000393 /* We don't check or use the second (integer) entry. */
394
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000395 Py_DECREF(args);
396 Py_DECREF(encoder);
397 Py_DECREF(result);
398 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000399
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000400 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000401 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000402 Py_XDECREF(args);
403 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000404 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000405}
406
407/* Decode an object (usually a Python string) using the given encoding
408 and return an equivalent object (e.g. an Unicode object).
409
410 errors is passed to the decoder factory as argument if non-NULL. */
411
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200412static PyObject *
413_PyCodec_DecodeInternal(PyObject *object,
414 PyObject *decoder,
415 const char *encoding,
416 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000417{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000418 PyObject *args = NULL, *result = NULL;
419 PyObject *v;
420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 args = args_tuple(object, errors);
422 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000424
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000425 result = PyEval_CallObject(decoder,args);
426 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000428 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000429 PyTuple_GET_SIZE(result) != 2) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoder must return a tuple (object,integer)");
432 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000433 }
434 v = PyTuple_GET_ITEM(result,0);
435 Py_INCREF(v);
436 /* We don't check or use the second (integer) entry. */
437
438 Py_DECREF(args);
439 Py_DECREF(decoder);
440 Py_DECREF(result);
441 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000442
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000443 onError:
444 Py_XDECREF(args);
445 Py_XDECREF(decoder);
446 Py_XDECREF(result);
447 return NULL;
448}
449
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200450/* Generic encoding/decoding API */
451PyObject *PyCodec_Encode(PyObject *object,
452 const char *encoding,
453 const char *errors)
454{
455 PyObject *encoder;
456
457 encoder = PyCodec_Encoder(encoding);
458 if (encoder == NULL)
459 return NULL;
460
461 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
462}
463
464PyObject *PyCodec_Decode(PyObject *object,
465 const char *encoding,
466 const char *errors)
467{
468 PyObject *decoder;
469
470 decoder = PyCodec_Decoder(encoding);
471 if (decoder == NULL)
472 return NULL;
473
474 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
475}
476
477/* Text encoding/decoding API */
Georg Brandl2fc8f772014-03-02 09:18:31 +0100478PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
479 const char *alternate_command)
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200480{
481 _Py_IDENTIFIER(_is_text_encoding);
482 PyObject *codec;
483 PyObject *attr;
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200484 int is_text_codec;
485
486 codec = _PyCodec_Lookup(encoding);
487 if (codec == NULL)
488 return NULL;
489
490 /* Backwards compatibility: assume any raw tuple describes a text
491 * encoding, and the same for anything lacking the private
492 * attribute.
493 */
494 if (!PyTuple_CheckExact(codec)) {
495 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
496 if (attr == NULL) {
497 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
498 PyErr_Clear();
499 } else {
500 Py_DECREF(codec);
501 return NULL;
502 }
503 } else {
504 is_text_codec = PyObject_IsTrue(attr);
505 Py_DECREF(attr);
506 if (!is_text_codec) {
507 Py_DECREF(codec);
508 PyErr_Format(PyExc_LookupError,
509 "'%.400s' is not a text encoding; "
Georg Brandl2fc8f772014-03-02 09:18:31 +0100510 "use %s to handle arbitrary codecs",
511 encoding, alternate_command);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200512 return NULL;
513 }
514 }
515 }
516
Georg Brandl2fc8f772014-03-02 09:18:31 +0100517 /* This appears to be a valid text encoding */
518 return codec;
519}
520
521
522static
523PyObject *codec_getitem_checked(const char *encoding,
524 const char *alternate_command,
525 int index)
526{
527 PyObject *codec;
528 PyObject *v;
529
530 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
531 if (codec == NULL)
532 return NULL;
533
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200534 v = PyTuple_GET_ITEM(codec, index);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200535 Py_INCREF(v);
Georg Brandl2fc8f772014-03-02 09:18:31 +0100536 Py_DECREF(codec);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200537 return v;
538}
539
540static PyObject * _PyCodec_TextEncoder(const char *encoding)
541{
Georg Brandl2fc8f772014-03-02 09:18:31 +0100542 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200543}
544
545static PyObject * _PyCodec_TextDecoder(const char *encoding)
546{
Georg Brandl2fc8f772014-03-02 09:18:31 +0100547 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200548}
549
550PyObject *_PyCodec_EncodeText(PyObject *object,
551 const char *encoding,
552 const char *errors)
553{
554 PyObject *encoder;
555
556 encoder = _PyCodec_TextEncoder(encoding);
557 if (encoder == NULL)
558 return NULL;
559
560 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
561}
562
563PyObject *_PyCodec_DecodeText(PyObject *object,
564 const char *encoding,
565 const char *errors)
566{
567 PyObject *decoder;
568
569 decoder = _PyCodec_TextDecoder(encoding);
570 if (decoder == NULL)
571 return NULL;
572
573 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
574}
575
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000576/* Register the error handling callback function error under the name
577 name. This function will be called by the codec when it encounters
578 an unencodable characters/undecodable bytes and doesn't know the
579 callback name, when name is specified as the error parameter
580 in the call to the encode/decode function.
581 Return 0 on success, -1 on error */
582int PyCodec_RegisterError(const char *name, PyObject *error)
583{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000584 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000585 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000587 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 PyErr_SetString(PyExc_TypeError, "handler must be callable");
589 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000590 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000591 return PyDict_SetItemString(interp->codec_error_registry,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000592 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000593}
594
595/* Lookup the error handling callback function registered under the
596 name error. As a special case NULL can be passed, in which case
597 the error handling callback for strict encoding will be returned. */
598PyObject *PyCodec_LookupError(const char *name)
599{
600 PyObject *handler = NULL;
601
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000602 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000603 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000605
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000606 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000608 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000609 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000611 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000612 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000613 return handler;
614}
615
616static void wrong_exception_type(PyObject *exc)
617{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200618 _Py_IDENTIFIER(__class__);
619 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200620 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000621 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200622 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000623 Py_DECREF(type);
624 if (name != NULL) {
625 PyErr_Format(PyExc_TypeError,
626 "don't know how to handle %S in error callback", name);
627 Py_DECREF(name);
628 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000629 }
630}
631
632PyObject *PyCodec_StrictErrors(PyObject *exc)
633{
Brett Cannonbf364092006-03-01 04:25:17 +0000634 if (PyExceptionInstance_Check(exc))
635 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000636 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638 return NULL;
639}
640
641
642PyObject *PyCodec_IgnoreErrors(PyObject *exc)
643{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 if (PyUnicodeEncodeError_GetEnd(exc, &end))
647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000648 }
649 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 if (PyUnicodeDecodeError_GetEnd(exc, &end))
651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 }
653 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 if (PyUnicodeTranslateError_GetEnd(exc, &end))
655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 }
657 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 wrong_exception_type(exc);
659 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000660 }
Victor Stinneree450092011-12-01 02:52:11 +0100661 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662}
663
664
665PyObject *PyCodec_ReplaceErrors(PyObject *exc)
666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200667 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668
669 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671 int kind;
672 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 if (PyUnicodeEncodeError_GetStart(exc, &start))
674 return NULL;
675 if (PyUnicodeEncodeError_GetEnd(exc, &end))
676 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 len = end - start;
678 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 if (res == NULL)
680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200681 kind = PyUnicode_KIND(res);
682 data = PyUnicode_DATA(res);
683 for (i = 0; i < len; ++i)
684 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200685 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200686 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000687 }
688 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689 if (PyUnicodeDecodeError_GetEnd(exc, &end))
690 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200691 return Py_BuildValue("(Cn)",
692 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
693 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000694 }
695 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 int kind;
698 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699 if (PyUnicodeTranslateError_GetStart(exc, &start))
700 return NULL;
701 if (PyUnicodeTranslateError_GetEnd(exc, &end))
702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 len = end - start;
704 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (res == NULL)
706 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707 kind = PyUnicode_KIND(res);
708 data = PyUnicode_DATA(res);
709 for (i=0; i < len; i++)
710 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200711 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000713 }
714 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000715 wrong_exception_type(exc);
716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000717 }
718}
719
720PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
721{
722 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000723 PyObject *restuple;
724 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100725 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000726 Py_ssize_t start;
727 Py_ssize_t end;
728 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100729 unsigned char *outp;
Serhiy Storchaka4b168182014-10-04 14:15:49 +0300730 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100731 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (PyUnicodeEncodeError_GetStart(exc, &start))
733 return NULL;
734 if (PyUnicodeEncodeError_GetEnd(exc, &end))
735 return NULL;
736 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
737 return NULL;
Serhiy Storchaka4b168182014-10-04 14:15:49 +0300738 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
739 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100740 for (i = start, ressize = 0; i < end; ++i) {
741 /* object is guaranteed to be "ready" */
742 ch = PyUnicode_READ_CHAR(object, i);
743 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100745 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100747 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100749 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100751 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100753 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 ressize += 2+6+1;
755 else
756 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 }
758 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100759 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 if (res == NULL) {
761 Py_DECREF(object);
762 return NULL;
763 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100764 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100766 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000767 int digits;
768 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100769 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000770 *outp++ = '&';
771 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100772 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 digits = 1;
774 base = 1;
775 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100776 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 digits = 2;
778 base = 10;
779 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100780 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 digits = 3;
782 base = 100;
783 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 digits = 4;
786 base = 1000;
787 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 digits = 5;
790 base = 10000;
791 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 digits = 6;
794 base = 100000;
795 }
796 else {
797 digits = 7;
798 base = 1000000;
799 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100801 *outp++ = '0' + ch/base;
802 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 base /= 10;
804 }
805 *outp++ = ';';
806 }
Victor Stinner8f825062012-04-27 13:55:39 +0200807 assert(_PyUnicode_CheckConsistency(res, 1));
808 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 Py_DECREF(object);
810 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000811 }
812 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 wrong_exception_type(exc);
814 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000815 }
816}
817
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000818PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
819{
820 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 PyObject *restuple;
822 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100823 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 Py_ssize_t start;
825 Py_ssize_t end;
826 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100827 unsigned char *outp;
Serhiy Storchaka4b168182014-10-04 14:15:49 +0300828 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100829 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 if (PyUnicodeEncodeError_GetStart(exc, &start))
831 return NULL;
832 if (PyUnicodeEncodeError_GetEnd(exc, &end))
833 return NULL;
834 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
835 return NULL;
Serhiy Storchaka4b168182014-10-04 14:15:49 +0300836 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
837 end = start + PY_SSIZE_T_MAX / (1+1+8);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100838 for (i = start, ressize = 0; i < end; ++i) {
839 /* object is guaranteed to be "ready" */
840 c = PyUnicode_READ_CHAR(object, i);
841 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100843 }
844 else if (c >= 0x100) {
845 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 }
847 else
848 ressize += 1+1+2;
849 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100850 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000851 if (res==NULL)
852 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100853 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
854 i < end; ++i) {
855 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000856 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 if (c >= 0x00010000) {
858 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200859 *outp++ = Py_hexdigits[(c>>28)&0xf];
860 *outp++ = Py_hexdigits[(c>>24)&0xf];
861 *outp++ = Py_hexdigits[(c>>20)&0xf];
862 *outp++ = Py_hexdigits[(c>>16)&0xf];
863 *outp++ = Py_hexdigits[(c>>12)&0xf];
864 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000865 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000866 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200868 *outp++ = Py_hexdigits[(c>>12)&0xf];
869 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 }
871 else
872 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200873 *outp++ = Py_hexdigits[(c>>4)&0xf];
874 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000875 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000876
Victor Stinner8f825062012-04-27 13:55:39 +0200877 assert(_PyUnicode_CheckConsistency(res, 1));
878 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 Py_DECREF(object);
880 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000881 }
882 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000883 wrong_exception_type(exc);
884 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000885 }
886}
887
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000888/* This handler is declared static until someone demonstrates
889 a need to call it directly. */
890static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000891PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000892{
893 PyObject *restuple;
894 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100895 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000896 Py_ssize_t start;
897 Py_ssize_t end;
898 PyObject *res;
899 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000900 char *outp;
901 if (PyUnicodeEncodeError_GetStart(exc, &start))
902 return NULL;
903 if (PyUnicodeEncodeError_GetEnd(exc, &end))
904 return NULL;
905 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
906 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000907 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
908 if (!res) {
909 Py_DECREF(object);
910 return NULL;
911 }
912 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100913 for (i = start; i < end; i++) {
914 /* object is guaranteed to be "ready" */
915 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000916 if (ch < 0xd800 || ch > 0xdfff) {
917 /* Not a surrogate, fail with original exception */
918 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
919 Py_DECREF(res);
920 Py_DECREF(object);
921 return NULL;
922 }
923 *outp++ = (char)(0xe0 | (ch >> 12));
924 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
925 *outp++ = (char)(0x80 | (ch & 0x3f));
926 }
927 restuple = Py_BuildValue("(On)", res, end);
928 Py_DECREF(res);
929 Py_DECREF(object);
930 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000931 }
932 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000933 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100934 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000935 if (PyUnicodeDecodeError_GetStart(exc, &start))
936 return NULL;
937 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
938 return NULL;
939 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
940 Py_DECREF(object);
941 return NULL;
942 }
943 /* Try decoding a single surrogate character. If
944 there are more, let the codec call us again. */
945 p += start;
Ezio Melotti540da762012-11-03 23:03:39 +0200946 if (PyBytes_GET_SIZE(object) - start >= 3 &&
947 (p[0] & 0xf0) == 0xe0 &&
948 (p[1] & 0xc0) == 0x80 &&
949 (p[2] & 0xc0) == 0x80) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000950 /* it's a three-byte code */
951 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
952 if (ch < 0xd800 || ch > 0xdfff)
953 /* it's not a surrogate - fail */
954 ch = 0;
955 }
956 Py_DECREF(object);
957 if (ch == 0) {
958 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
959 return NULL;
960 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100961 res = PyUnicode_FromOrdinal(ch);
962 if (res == NULL)
963 return NULL;
964 return Py_BuildValue("(Nn)", res, start+3);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000965 }
966 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000967 wrong_exception_type(exc);
968 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000969 }
970}
971
Martin v. Löwis011e8422009-05-05 04:43:17 +0000972static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000973PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000974{
975 PyObject *restuple;
976 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100977 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000978 Py_ssize_t start;
979 Py_ssize_t end;
980 PyObject *res;
981 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000982 char *outp;
983 if (PyUnicodeEncodeError_GetStart(exc, &start))
984 return NULL;
985 if (PyUnicodeEncodeError_GetEnd(exc, &end))
986 return NULL;
987 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
988 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000989 res = PyBytes_FromStringAndSize(NULL, end-start);
990 if (!res) {
991 Py_DECREF(object);
992 return NULL;
993 }
994 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100995 for (i = start; i < end; i++) {
996 /* object is guaranteed to be "ready" */
997 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000998 if (ch < 0xdc80 || ch > 0xdcff) {
999 /* Not a UTF-8b surrogate, fail with original exception */
1000 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1001 Py_DECREF(res);
1002 Py_DECREF(object);
1003 return NULL;
1004 }
1005 *outp++ = ch - 0xdc00;
1006 }
1007 restuple = Py_BuildValue("(On)", res, end);
1008 Py_DECREF(res);
1009 Py_DECREF(object);
1010 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001011 }
1012 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001013 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001014 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001015 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001016 int consumed = 0;
1017 if (PyUnicodeDecodeError_GetStart(exc, &start))
1018 return NULL;
1019 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1020 return NULL;
1021 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1022 return NULL;
1023 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1024 Py_DECREF(object);
1025 return NULL;
1026 }
1027 while (consumed < 4 && consumed < end-start) {
1028 /* Refuse to escape ASCII bytes. */
1029 if (p[start+consumed] < 128)
1030 break;
1031 ch[consumed] = 0xdc00 + p[start+consumed];
1032 consumed++;
1033 }
1034 Py_DECREF(object);
1035 if (!consumed) {
1036 /* codec complained about ASCII byte. */
1037 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1038 return NULL;
1039 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001040 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1041 if (str == NULL)
1042 return NULL;
1043 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001044 }
1045 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 wrong_exception_type(exc);
1047 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001048 }
1049}
1050
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001051
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001052static PyObject *strict_errors(PyObject *self, PyObject *exc)
1053{
1054 return PyCodec_StrictErrors(exc);
1055}
1056
1057
1058static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1059{
1060 return PyCodec_IgnoreErrors(exc);
1061}
1062
1063
1064static PyObject *replace_errors(PyObject *self, PyObject *exc)
1065{
1066 return PyCodec_ReplaceErrors(exc);
1067}
1068
1069
1070static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1071{
1072 return PyCodec_XMLCharRefReplaceErrors(exc);
1073}
1074
1075
1076static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1077{
1078 return PyCodec_BackslashReplaceErrors(exc);
1079}
1080
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001081static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001082{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001083 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001084}
1085
Martin v. Löwis43c57782009-05-10 08:15:24 +00001086static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001087{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001088 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001089}
1090
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001091static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001092{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 char *name;
1095 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001096 } methods[] =
1097 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001098 {
1099 "strict",
1100 {
1101 "strict_errors",
1102 strict_errors,
1103 METH_O,
1104 PyDoc_STR("Implements the 'strict' error handling, which "
1105 "raises a UnicodeError on coding errors.")
1106 }
1107 },
1108 {
1109 "ignore",
1110 {
1111 "ignore_errors",
1112 ignore_errors,
1113 METH_O,
1114 PyDoc_STR("Implements the 'ignore' error handling, which "
1115 "ignores malformed data and continues.")
1116 }
1117 },
1118 {
1119 "replace",
1120 {
1121 "replace_errors",
1122 replace_errors,
1123 METH_O,
1124 PyDoc_STR("Implements the 'replace' error handling, which "
1125 "replaces malformed data with a replacement marker.")
1126 }
1127 },
1128 {
1129 "xmlcharrefreplace",
1130 {
1131 "xmlcharrefreplace_errors",
1132 xmlcharrefreplace_errors,
1133 METH_O,
1134 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1135 "which replaces an unencodable character with the "
1136 "appropriate XML character reference.")
1137 }
1138 },
1139 {
1140 "backslashreplace",
1141 {
1142 "backslashreplace_errors",
1143 backslashreplace_errors,
1144 METH_O,
1145 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1146 "which replaces an unencodable character with a "
1147 "backslashed escape sequence.")
1148 }
1149 },
1150 {
1151 "surrogatepass",
1152 {
1153 "surrogatepass",
1154 surrogatepass_errors,
1155 METH_O
1156 }
1157 },
1158 {
1159 "surrogateescape",
1160 {
1161 "surrogateescape",
1162 surrogateescape_errors,
1163 METH_O
1164 }
1165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001167
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001168 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001169 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001170 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001171
1172 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001174
1175 interp->codec_search_path = PyList_New(0);
1176 interp->codec_search_cache = PyDict_New();
1177 interp->codec_error_registry = PyDict_New();
1178
1179 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001180 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1182 int res;
1183 if (!func)
1184 Py_FatalError("can't initialize codec error registry");
1185 res = PyCodec_RegisterError(methods[i].name, func);
1186 Py_DECREF(func);
1187 if (res)
1188 Py_FatalError("can't initialize codec error registry");
1189 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001190 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001191
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001192 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001193 interp->codec_search_cache == NULL ||
1194 interp->codec_error_registry == NULL)
1195 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001196
Christian Heimes819b8bf2008-01-03 23:05:47 +00001197 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001198 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001199 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001200 }
1201 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001202 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001203 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001204}