blob: 0b736c1715b20dcd13a45bfa7edaa5bf216c08ec [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* Lowercase hexadecimal digit characters: the character at index d
   (0 <= d <= 15) is the digit for the value d. */
const char *Py_hexdigits = "0123456789" "abcdef";
15
/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done in a lazy way so that the Unicode implementation does
   not downgrade startup time of scripts not needing it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Guido van Rossum33831132000-06-29 14:50:15 +000056 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
68 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
70 register char ch = string[i];
71 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
Georg Brandl2fc8f772014-03-02 09:18:31 +0100246/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247static
Georg Brandl2fc8f772014-03-02 09:18:31 +0100248PyObject *codec_makeincrementalcodec(PyObject *codec_info,
249 const char *errors,
250 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251{
Georg Brandl2fc8f772014-03-02 09:18:31 +0100252 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253
Georg Brandl2fc8f772014-03-02 09:18:31 +0100254 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000257 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 Py_DECREF(inccodec);
262 return ret;
263}
264
Georg Brandl2fc8f772014-03-02 09:18:31 +0100265static
266PyObject *codec_getincrementalcodec(const char *encoding,
267 const char *errors,
268 const char *attrname)
269{
270 PyObject *codec_info, *ret;
271
272 codec_info = _PyCodec_Lookup(encoding);
273 if (codec_info == NULL)
274 return NULL;
275 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
276 Py_DECREF(codec_info);
277 return ret;
278}
279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280/* Helper function to create a stream codec. */
281
282static
283PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 PyObject *stream,
285 const char *errors,
286 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000288 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289
290 codecs = _PyCodec_Lookup(encoding);
291 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000294 codeccls = PyTuple_GET_ITEM(codecs, index);
295 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000296 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000297 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 Py_DECREF(codecs);
300 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000301}
302
Georg Brandl2fc8f772014-03-02 09:18:31 +0100303/* Helpers to work with the result of _PyCodec_Lookup
304
305 */
306PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
307 const char *errors)
308{
309 return codec_makeincrementalcodec(codec_info, errors,
310 "incrementaldecoder");
311}
312
313PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
314 const char *errors)
315{
316 return codec_makeincrementalcodec(codec_info, errors,
317 "incrementalencoder");
318}
319
320
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.

 */
326
327PyObject *PyCodec_Encoder(const char *encoding)
328{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000329 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000330}
331
332PyObject *PyCodec_Decoder(const char *encoding)
333{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000334 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000335}
336
Thomas Woutersa9773292006-04-21 09:43:23 +0000337PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000339{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000341}
342
343PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000345{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000346 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000347}
348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 PyObject *stream,
351 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000354}
355
356PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 PyObject *stream,
358 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361}
362
363/* Encode an object (e.g. an Unicode object) using the given encoding
364 and return the resulting encoded object (usually a Python string).
365
366 errors is passed to the encoder factory as argument if non-NULL. */
367
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200368static PyObject *
369_PyCodec_EncodeInternal(PyObject *object,
370 PyObject *encoder,
371 const char *encoding,
372 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000374 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000375 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377 args = args_tuple(object, errors);
378 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000380
381 result = PyEval_CallObject(encoder, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000384
Guido van Rossum98297ee2007-11-06 21:34:58 +0000385 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000386 PyTuple_GET_SIZE(result) != 2) {
387 PyErr_SetString(PyExc_TypeError,
388 "encoder must return a tuple (object, integer)");
389 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000390 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000391 v = PyTuple_GET_ITEM(result,0);
392 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000393 /* We don't check or use the second (integer) entry. */
394
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000395 Py_DECREF(args);
396 Py_DECREF(encoder);
397 Py_DECREF(result);
398 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000399
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000400 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000401 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000402 Py_XDECREF(args);
403 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000404 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000405}
406
407/* Decode an object (usually a Python string) using the given encoding
408 and return an equivalent object (e.g. an Unicode object).
409
410 errors is passed to the decoder factory as argument if non-NULL. */
411
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200412static PyObject *
413_PyCodec_DecodeInternal(PyObject *object,
414 PyObject *decoder,
415 const char *encoding,
416 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000417{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000418 PyObject *args = NULL, *result = NULL;
419 PyObject *v;
420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 args = args_tuple(object, errors);
422 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000424
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000425 result = PyEval_CallObject(decoder,args);
426 if (result == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000428 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000429 PyTuple_GET_SIZE(result) != 2) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoder must return a tuple (object,integer)");
432 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000433 }
434 v = PyTuple_GET_ITEM(result,0);
435 Py_INCREF(v);
436 /* We don't check or use the second (integer) entry. */
437
438 Py_DECREF(args);
439 Py_DECREF(decoder);
440 Py_DECREF(result);
441 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000442
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000443 onError:
444 Py_XDECREF(args);
445 Py_XDECREF(decoder);
446 Py_XDECREF(result);
447 return NULL;
448}
449
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200450/* Generic encoding/decoding API */
451PyObject *PyCodec_Encode(PyObject *object,
452 const char *encoding,
453 const char *errors)
454{
455 PyObject *encoder;
456
457 encoder = PyCodec_Encoder(encoding);
458 if (encoder == NULL)
459 return NULL;
460
461 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
462}
463
464PyObject *PyCodec_Decode(PyObject *object,
465 const char *encoding,
466 const char *errors)
467{
468 PyObject *decoder;
469
470 decoder = PyCodec_Decoder(encoding);
471 if (decoder == NULL)
472 return NULL;
473
474 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
475}
476
477/* Text encoding/decoding API */
Georg Brandl2fc8f772014-03-02 09:18:31 +0100478PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
479 const char *alternate_command)
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200480{
481 _Py_IDENTIFIER(_is_text_encoding);
482 PyObject *codec;
483 PyObject *attr;
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200484 int is_text_codec;
485
486 codec = _PyCodec_Lookup(encoding);
487 if (codec == NULL)
488 return NULL;
489
490 /* Backwards compatibility: assume any raw tuple describes a text
491 * encoding, and the same for anything lacking the private
492 * attribute.
493 */
494 if (!PyTuple_CheckExact(codec)) {
495 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
496 if (attr == NULL) {
497 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
498 PyErr_Clear();
499 } else {
500 Py_DECREF(codec);
501 return NULL;
502 }
503 } else {
504 is_text_codec = PyObject_IsTrue(attr);
505 Py_DECREF(attr);
506 if (!is_text_codec) {
507 Py_DECREF(codec);
508 PyErr_Format(PyExc_LookupError,
509 "'%.400s' is not a text encoding; "
Georg Brandl2fc8f772014-03-02 09:18:31 +0100510 "use %s to handle arbitrary codecs",
511 encoding, alternate_command);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200512 return NULL;
513 }
514 }
515 }
516
Georg Brandl2fc8f772014-03-02 09:18:31 +0100517 /* This appears to be a valid text encoding */
518 return codec;
519}
520
521
522static
523PyObject *codec_getitem_checked(const char *encoding,
524 const char *alternate_command,
525 int index)
526{
527 PyObject *codec;
528 PyObject *v;
529
530 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
531 if (codec == NULL)
532 return NULL;
533
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200534 v = PyTuple_GET_ITEM(codec, index);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200535 Py_INCREF(v);
Georg Brandl2fc8f772014-03-02 09:18:31 +0100536 Py_DECREF(codec);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200537 return v;
538}
539
540static PyObject * _PyCodec_TextEncoder(const char *encoding)
541{
Georg Brandl2fc8f772014-03-02 09:18:31 +0100542 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200543}
544
545static PyObject * _PyCodec_TextDecoder(const char *encoding)
546{
Georg Brandl2fc8f772014-03-02 09:18:31 +0100547 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Serhiy Storchaka94ee3892014-02-24 14:43:03 +0200548}
549
550PyObject *_PyCodec_EncodeText(PyObject *object,
551 const char *encoding,
552 const char *errors)
553{
554 PyObject *encoder;
555
556 encoder = _PyCodec_TextEncoder(encoding);
557 if (encoder == NULL)
558 return NULL;
559
560 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
561}
562
563PyObject *_PyCodec_DecodeText(PyObject *object,
564 const char *encoding,
565 const char *errors)
566{
567 PyObject *decoder;
568
569 decoder = _PyCodec_TextDecoder(encoding);
570 if (decoder == NULL)
571 return NULL;
572
573 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
574}
575
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000576/* Register the error handling callback function error under the name
577 name. This function will be called by the codec when it encounters
578 an unencodable characters/undecodable bytes and doesn't know the
579 callback name, when name is specified as the error parameter
580 in the call to the encode/decode function.
581 Return 0 on success, -1 on error */
582int PyCodec_RegisterError(const char *name, PyObject *error)
583{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000584 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000585 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000586 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000587 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000588 PyErr_SetString(PyExc_TypeError, "handler must be callable");
589 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000590 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000591 return PyDict_SetItemString(interp->codec_error_registry,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000592 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000593}
594
595/* Lookup the error handling callback function registered under the
596 name error. As a special case NULL can be passed, in which case
597 the error handling callback for strict encoding will be returned. */
598PyObject *PyCodec_LookupError(const char *name)
599{
600 PyObject *handler = NULL;
601
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000602 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000603 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000604 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000605
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000606 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000607 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000608 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000609 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000610 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000611 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000612 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000613 return handler;
614}
615
616static void wrong_exception_type(PyObject *exc)
617{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200618 _Py_IDENTIFIER(__class__);
619 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200620 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000621 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200622 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000623 Py_DECREF(type);
624 if (name != NULL) {
625 PyErr_Format(PyExc_TypeError,
626 "don't know how to handle %S in error callback", name);
627 Py_DECREF(name);
628 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000629 }
630}
631
632PyObject *PyCodec_StrictErrors(PyObject *exc)
633{
Brett Cannonbf364092006-03-01 04:25:17 +0000634 if (PyExceptionInstance_Check(exc))
635 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000636 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638 return NULL;
639}
640
641
642PyObject *PyCodec_IgnoreErrors(PyObject *exc)
643{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000644 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 if (PyUnicodeEncodeError_GetEnd(exc, &end))
647 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000648 }
649 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 if (PyUnicodeDecodeError_GetEnd(exc, &end))
651 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 }
653 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 if (PyUnicodeTranslateError_GetEnd(exc, &end))
655 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 }
657 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 wrong_exception_type(exc);
659 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000660 }
Victor Stinneree450092011-12-01 02:52:11 +0100661 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662}
663
664
665PyObject *PyCodec_ReplaceErrors(PyObject *exc)
666{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200667 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668
669 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200671 int kind;
672 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 if (PyUnicodeEncodeError_GetStart(exc, &start))
674 return NULL;
675 if (PyUnicodeEncodeError_GetEnd(exc, &end))
676 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 len = end - start;
678 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 if (res == NULL)
680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200681 kind = PyUnicode_KIND(res);
682 data = PyUnicode_DATA(res);
683 for (i = 0; i < len; ++i)
684 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200685 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200686 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000687 }
688 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689 if (PyUnicodeDecodeError_GetEnd(exc, &end))
690 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200691 return Py_BuildValue("(Cn)",
692 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
693 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000694 }
695 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 int kind;
698 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699 if (PyUnicodeTranslateError_GetStart(exc, &start))
700 return NULL;
701 if (PyUnicodeTranslateError_GetEnd(exc, &end))
702 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 len = end - start;
704 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (res == NULL)
706 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707 kind = PyUnicode_KIND(res);
708 data = PyUnicode_DATA(res);
709 for (i=0; i < len; i++)
710 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200711 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000713 }
714 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000715 wrong_exception_type(exc);
716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000717 }
718}
719
720PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
721{
722 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000723 PyObject *restuple;
724 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100725 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000726 Py_ssize_t start;
727 Py_ssize_t end;
728 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100729 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000730 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100731 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (PyUnicodeEncodeError_GetStart(exc, &start))
733 return NULL;
734 if (PyUnicodeEncodeError_GetEnd(exc, &end))
735 return NULL;
736 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
737 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100738 for (i = start, ressize = 0; i < end; ++i) {
739 /* object is guaranteed to be "ready" */
740 ch = PyUnicode_READ_CHAR(object, i);
741 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100743 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100745 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100747 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100749 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100751 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 ressize += 2+6+1;
753 else
754 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 }
756 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100757 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758 if (res == NULL) {
759 Py_DECREF(object);
760 return NULL;
761 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100762 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100764 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 int digits;
766 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100767 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768 *outp++ = '&';
769 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100770 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 digits = 1;
772 base = 1;
773 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100774 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 digits = 2;
776 base = 10;
777 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100778 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 digits = 3;
780 base = 100;
781 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 digits = 4;
784 base = 1000;
785 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 digits = 5;
788 base = 10000;
789 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 digits = 6;
792 base = 100000;
793 }
794 else {
795 digits = 7;
796 base = 1000000;
797 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100799 *outp++ = '0' + ch/base;
800 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 base /= 10;
802 }
803 *outp++ = ';';
804 }
Victor Stinner8f825062012-04-27 13:55:39 +0200805 assert(_PyUnicode_CheckConsistency(res, 1));
806 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 Py_DECREF(object);
808 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000809 }
810 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 wrong_exception_type(exc);
812 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000813 }
814}
815
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000816PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
817{
818 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 PyObject *restuple;
820 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 Py_ssize_t start;
823 Py_ssize_t end;
824 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100827 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 if (PyUnicodeEncodeError_GetStart(exc, &start))
829 return NULL;
830 if (PyUnicodeEncodeError_GetEnd(exc, &end))
831 return NULL;
832 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
833 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100834 for (i = start, ressize = 0; i < end; ++i) {
835 /* object is guaranteed to be "ready" */
836 c = PyUnicode_READ_CHAR(object, i);
837 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100839 }
840 else if (c >= 0x100) {
841 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 }
843 else
844 ressize += 1+1+2;
845 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100846 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000847 if (res==NULL)
848 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100849 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
850 i < end; ++i) {
851 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 if (c >= 0x00010000) {
854 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200855 *outp++ = Py_hexdigits[(c>>28)&0xf];
856 *outp++ = Py_hexdigits[(c>>24)&0xf];
857 *outp++ = Py_hexdigits[(c>>20)&0xf];
858 *outp++ = Py_hexdigits[(c>>16)&0xf];
859 *outp++ = Py_hexdigits[(c>>12)&0xf];
860 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000861 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000862 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000863 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200864 *outp++ = Py_hexdigits[(c>>12)&0xf];
865 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000866 }
867 else
868 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200869 *outp++ = Py_hexdigits[(c>>4)&0xf];
870 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000871 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872
Victor Stinner8f825062012-04-27 13:55:39 +0200873 assert(_PyUnicode_CheckConsistency(res, 1));
874 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000875 Py_DECREF(object);
876 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000877 }
878 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000879 wrong_exception_type(exc);
880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000881 }
882}
883
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000884/* This handler is declared static until someone demonstrates
885 a need to call it directly. */
886static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000887PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000888{
889 PyObject *restuple;
890 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100891 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000892 Py_ssize_t start;
893 Py_ssize_t end;
894 PyObject *res;
895 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000896 char *outp;
897 if (PyUnicodeEncodeError_GetStart(exc, &start))
898 return NULL;
899 if (PyUnicodeEncodeError_GetEnd(exc, &end))
900 return NULL;
901 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
902 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
904 if (!res) {
905 Py_DECREF(object);
906 return NULL;
907 }
908 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100909 for (i = start; i < end; i++) {
910 /* object is guaranteed to be "ready" */
911 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000912 if (ch < 0xd800 || ch > 0xdfff) {
913 /* Not a surrogate, fail with original exception */
914 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
915 Py_DECREF(res);
916 Py_DECREF(object);
917 return NULL;
918 }
919 *outp++ = (char)(0xe0 | (ch >> 12));
920 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
921 *outp++ = (char)(0x80 | (ch & 0x3f));
922 }
923 restuple = Py_BuildValue("(On)", res, end);
924 Py_DECREF(res);
925 Py_DECREF(object);
926 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000927 }
928 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000929 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100930 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931 if (PyUnicodeDecodeError_GetStart(exc, &start))
932 return NULL;
933 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
934 return NULL;
935 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
936 Py_DECREF(object);
937 return NULL;
938 }
939 /* Try decoding a single surrogate character. If
940 there are more, let the codec call us again. */
941 p += start;
Ezio Melotti540da762012-11-03 23:03:39 +0200942 if (PyBytes_GET_SIZE(object) - start >= 3 &&
943 (p[0] & 0xf0) == 0xe0 &&
944 (p[1] & 0xc0) == 0x80 &&
945 (p[2] & 0xc0) == 0x80) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000946 /* it's a three-byte code */
947 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
948 if (ch < 0xd800 || ch > 0xdfff)
949 /* it's not a surrogate - fail */
950 ch = 0;
951 }
952 Py_DECREF(object);
953 if (ch == 0) {
954 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
955 return NULL;
956 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +0100957 res = PyUnicode_FromOrdinal(ch);
958 if (res == NULL)
959 return NULL;
960 return Py_BuildValue("(Nn)", res, start+3);
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000961 }
962 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000963 wrong_exception_type(exc);
964 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000965 }
966}
967
Martin v. Löwis011e8422009-05-05 04:43:17 +0000968static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +0000969PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +0000970{
971 PyObject *restuple;
972 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100973 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +0000974 Py_ssize_t start;
975 Py_ssize_t end;
976 PyObject *res;
977 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000978 char *outp;
979 if (PyUnicodeEncodeError_GetStart(exc, &start))
980 return NULL;
981 if (PyUnicodeEncodeError_GetEnd(exc, &end))
982 return NULL;
983 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
984 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000985 res = PyBytes_FromStringAndSize(NULL, end-start);
986 if (!res) {
987 Py_DECREF(object);
988 return NULL;
989 }
990 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100991 for (i = start; i < end; i++) {
992 /* object is guaranteed to be "ready" */
993 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000994 if (ch < 0xdc80 || ch > 0xdcff) {
995 /* Not a UTF-8b surrogate, fail with original exception */
996 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
997 Py_DECREF(res);
998 Py_DECREF(object);
999 return NULL;
1000 }
1001 *outp++ = ch - 0xdc00;
1002 }
1003 restuple = Py_BuildValue("(On)", res, end);
1004 Py_DECREF(res);
1005 Py_DECREF(object);
1006 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001007 }
1008 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001009 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001010 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001011 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001012 int consumed = 0;
1013 if (PyUnicodeDecodeError_GetStart(exc, &start))
1014 return NULL;
1015 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1016 return NULL;
1017 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1018 return NULL;
1019 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1020 Py_DECREF(object);
1021 return NULL;
1022 }
1023 while (consumed < 4 && consumed < end-start) {
1024 /* Refuse to escape ASCII bytes. */
1025 if (p[start+consumed] < 128)
1026 break;
1027 ch[consumed] = 0xdc00 + p[start+consumed];
1028 consumed++;
1029 }
1030 Py_DECREF(object);
1031 if (!consumed) {
1032 /* codec complained about ASCII byte. */
1033 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1034 return NULL;
1035 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001036 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1037 if (str == NULL)
1038 return NULL;
1039 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001040 }
1041 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001042 wrong_exception_type(exc);
1043 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001044 }
1045}
1046
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001048static PyObject *strict_errors(PyObject *self, PyObject *exc)
1049{
1050 return PyCodec_StrictErrors(exc);
1051}
1052
1053
1054static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1055{
1056 return PyCodec_IgnoreErrors(exc);
1057}
1058
1059
1060static PyObject *replace_errors(PyObject *self, PyObject *exc)
1061{
1062 return PyCodec_ReplaceErrors(exc);
1063}
1064
1065
1066static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1067{
1068 return PyCodec_XMLCharRefReplaceErrors(exc);
1069}
1070
1071
1072static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1073{
1074 return PyCodec_BackslashReplaceErrors(exc);
1075}
1076
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001077static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001078{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001079 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001080}
1081
Martin v. Löwis43c57782009-05-10 08:15:24 +00001082static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001083{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001084 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001085}
1086
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001087static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001088{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001089 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001090 char *name;
1091 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001092 } methods[] =
1093 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001094 {
1095 "strict",
1096 {
1097 "strict_errors",
1098 strict_errors,
1099 METH_O,
1100 PyDoc_STR("Implements the 'strict' error handling, which "
1101 "raises a UnicodeError on coding errors.")
1102 }
1103 },
1104 {
1105 "ignore",
1106 {
1107 "ignore_errors",
1108 ignore_errors,
1109 METH_O,
1110 PyDoc_STR("Implements the 'ignore' error handling, which "
1111 "ignores malformed data and continues.")
1112 }
1113 },
1114 {
1115 "replace",
1116 {
1117 "replace_errors",
1118 replace_errors,
1119 METH_O,
1120 PyDoc_STR("Implements the 'replace' error handling, which "
1121 "replaces malformed data with a replacement marker.")
1122 }
1123 },
1124 {
1125 "xmlcharrefreplace",
1126 {
1127 "xmlcharrefreplace_errors",
1128 xmlcharrefreplace_errors,
1129 METH_O,
1130 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1131 "which replaces an unencodable character with the "
1132 "appropriate XML character reference.")
1133 }
1134 },
1135 {
1136 "backslashreplace",
1137 {
1138 "backslashreplace_errors",
1139 backslashreplace_errors,
1140 METH_O,
1141 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1142 "which replaces an unencodable character with a "
1143 "backslashed escape sequence.")
1144 }
1145 },
1146 {
1147 "surrogatepass",
1148 {
1149 "surrogatepass",
1150 surrogatepass_errors,
1151 METH_O
1152 }
1153 },
1154 {
1155 "surrogateescape",
1156 {
1157 "surrogateescape",
1158 surrogateescape_errors,
1159 METH_O
1160 }
1161 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001164 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001165 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001166 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001167
1168 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001169 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001170
1171 interp->codec_search_path = PyList_New(0);
1172 interp->codec_search_cache = PyDict_New();
1173 interp->codec_error_registry = PyDict_New();
1174
1175 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001176 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001177 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1178 int res;
1179 if (!func)
1180 Py_FatalError("can't initialize codec error registry");
1181 res = PyCodec_RegisterError(methods[i].name, func);
1182 Py_DECREF(func);
1183 if (res)
1184 Py_FatalError("can't initialize codec error registry");
1185 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001186 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001187
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001188 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001189 interp->codec_search_cache == NULL ||
1190 interp->codec_error_registry == NULL)
1191 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001192
Christian Heimes819b8bf2008-01-03 23:05:47 +00001193 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001194 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001195 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001196 }
1197 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001198 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001199 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001200}