blob: 18edfbdab94d60f23b00587f776999b027a36484 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060012#include "internal/pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
/* Lowercase hexadecimal digit characters: Py_hexdigits[n] is the digit
   character for the value n, for n in 0..15. */
const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
20/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000022
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
Guido van Rossumb95de4f2000-03-31 17:25:23 +000026 ImportErrors are silently ignored by this function. Only one try is
27 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
29*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000035 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Guido van Rossum9e896b32000-04-05 20:11:21 +000052/* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
81 if (v == NULL)
82 return NULL;
83 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000084 return v;
85}
86
87/* Lookup the given encoding and return a tuple providing the codec
88 facilities.
89
90 The encoding string is looked up converted to all lower-case
91 characters. This makes encodings looked up through this mechanism
92 effectively case-insensitive.
93
Guido van Rossum98297ee2007-11-06 21:34:58 +000094 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000095
96 As side effect, this tries to load the encodings package, if not
97 yet done. This is part of the lazy load strategy for the encodings
98 package.
99
100*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000101
102PyObject *_PyCodec_Lookup(const char *encoding)
103{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000105 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000106 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000107
Fred Drake766de832000-05-09 19:55:59 +0000108 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 PyErr_BadArgument();
110 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000111 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000113 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000114 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000116
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000118 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000119 replaced with underscores. */
120 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000121 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000122 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000123 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000124
125 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000126 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000128 Py_INCREF(result);
129 Py_DECREF(v);
130 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000132
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000133 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000134 args = PyTuple_New(1);
135 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000136 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000137 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000139 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000140 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000142 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 PyErr_SetString(PyExc_LookupError,
144 "no codec search functions registered: "
145 "can't find encoding");
146 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000147 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148
149 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000151
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 func = PyList_GetItem(interp->codec_search_path, i);
153 if (func == NULL)
154 goto onError;
155 result = PyEval_CallObject(func, args);
156 if (result == NULL)
157 goto onError;
158 if (result == Py_None) {
159 Py_DECREF(result);
160 continue;
161 }
162 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
163 PyErr_SetString(PyExc_TypeError,
164 "codec search functions must return 4-tuples");
165 Py_DECREF(result);
166 goto onError;
167 }
168 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000169 }
170 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 /* XXX Perhaps we should cache misses too ? */
172 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000173 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000175 }
176
177 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000178 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 Py_DECREF(result);
180 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000181 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000182 Py_DECREF(args);
183 return result;
184
185 onError:
186 Py_XDECREF(args);
187 return NULL;
188}
189
Nick Coghlan8fad1672014-09-15 23:50:44 +1200190int _PyCodec_Forget(const char *encoding)
191{
192 PyInterpreterState *interp;
193 PyObject *v;
194 int result;
195
196 interp = PyThreadState_GET()->interp;
197 if (interp->codec_search_path == NULL) {
198 return -1;
199 }
200
201 /* Convert the encoding to a normalized Python string: all
202 characters are converted to lower case, spaces and hyphens are
203 replaced with underscores. */
204 v = normalizestring(encoding);
205 if (v == NULL) {
206 return -1;
207 }
208
209 /* Drop the named codec from the internal cache */
210 result = PyDict_DelItem(interp->codec_search_cache, v);
211 Py_DECREF(v);
212
213 return result;
214}
215
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000216/* Codec registry encoding check API. */
217
218int PyCodec_KnownEncoding(const char *encoding)
219{
220 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000221
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000222 codecs = _PyCodec_Lookup(encoding);
223 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 PyErr_Clear();
225 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000226 }
227 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000228 Py_DECREF(codecs);
229 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000230 }
231}
232
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233static
234PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236{
237 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000238
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000239 args = PyTuple_New(1 + (errors != NULL));
240 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000242 Py_INCREF(object);
243 PyTuple_SET_ITEM(args,0,object);
244 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 v = PyUnicode_FromString(errors);
248 if (v == NULL) {
249 Py_DECREF(args);
250 return NULL;
251 }
252 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000253 }
254 return args;
255}
256
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000257/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000258
259static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000261{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 PyObject *codecs;
263 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000264
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000265 codecs = _PyCodec_Lookup(encoding);
266 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000268 v = PyTuple_GET_ITEM(codecs, index);
269 Py_DECREF(codecs);
270 Py_INCREF(v);
271 return v;
272}
273
Nick Coghlana9b15242014-02-04 22:11:18 +1000274/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275static
Nick Coghlana9b15242014-02-04 22:11:18 +1000276PyObject *codec_makeincrementalcodec(PyObject *codec_info,
277 const char *errors,
278 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279{
Nick Coghlana9b15242014-02-04 22:11:18 +1000280 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281
Nick Coghlana9b15242014-02-04 22:11:18 +1000282 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100288 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(inccodec);
290 return ret;
291}
292
Nick Coghlana9b15242014-02-04 22:11:18 +1000293static
294PyObject *codec_getincrementalcodec(const char *encoding,
295 const char *errors,
296 const char *attrname)
297{
298 PyObject *codec_info, *ret;
299
300 codec_info = _PyCodec_Lookup(encoding);
301 if (codec_info == NULL)
302 return NULL;
303 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
304 Py_DECREF(codec_info);
305 return ret;
306}
307
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000308/* Helper function to create a stream codec. */
309
310static
311PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000312 PyObject *stream,
313 const char *errors,
314 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000316 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000317
318 codecs = _PyCodec_Lookup(encoding);
319 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000320 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000321
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000322 codeccls = PyTuple_GET_ITEM(codecs, index);
323 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000325 else
Victor Stinner7bfb42d2016-12-05 17:04:32 +0100326 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000327 Py_DECREF(codecs);
328 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000329}
330
Nick Coghlana9b15242014-02-04 22:11:18 +1000331/* Helpers to work with the result of _PyCodec_Lookup
332
333 */
334PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
335 const char *errors)
336{
337 return codec_makeincrementalcodec(codec_info, errors,
338 "incrementaldecoder");
339}
340
341PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
342 const char *errors)
343{
344 return codec_makeincrementalcodec(codec_info, errors,
345 "incrementalencoder");
346}
347
348
Guido van Rossum98297ee2007-11-06 21:34:58 +0000349/* Convenience APIs to query the Codec registry.
350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000352
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000353 */
354
355PyObject *PyCodec_Encoder(const char *encoding)
356{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000357 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000358}
359
360PyObject *PyCodec_Decoder(const char *encoding)
361{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000363}
364
Thomas Woutersa9773292006-04-21 09:43:23 +0000365PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000367{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000369}
370
371PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000373{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000375}
376
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 PyObject *stream,
379 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382}
383
384PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000385 PyObject *stream,
386 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000387{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000389}
390
/* Try to make the reported exception chain name the codec whose
 * invocation failed, without changing the type of the exception raised.
 *
 * `operation` and `encoding` are substituted into the message passed to
 * _PyErr_TrySetFromCause().
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a
     * suitably updated clone when it can; otherwise the original
     * exception is left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
406
Martin Panter6245cb32016-04-15 02:14:19 +0000407/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000408 and return the resulting encoded object (usually a Python string).
409
410 errors is passed to the encoder factory as argument if non-NULL. */
411
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000412static PyObject *
413_PyCodec_EncodeInternal(PyObject *object,
414 PyObject *encoder,
415 const char *encoding,
416 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000417{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000418 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000419 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 args = args_tuple(object, errors);
422 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000424
425 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000426 if (result == NULL) {
427 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000428 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000429 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000430
Guido van Rossum98297ee2007-11-06 21:34:58 +0000431 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000432 PyTuple_GET_SIZE(result) != 2) {
433 PyErr_SetString(PyExc_TypeError,
434 "encoder must return a tuple (object, integer)");
435 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000436 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000437 v = PyTuple_GET_ITEM(result,0);
438 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000439 /* We don't check or use the second (integer) entry. */
440
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000441 Py_DECREF(args);
442 Py_DECREF(encoder);
443 Py_DECREF(result);
444 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000445
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000446 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000447 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000448 Py_XDECREF(args);
449 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000450 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000451}
452
453/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000454 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000455
456 errors is passed to the decoder factory as argument if non-NULL. */
457
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000458static PyObject *
459_PyCodec_DecodeInternal(PyObject *object,
460 PyObject *decoder,
461 const char *encoding,
462 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000463{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000464 PyObject *args = NULL, *result = NULL;
465 PyObject *v;
466
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000467 args = args_tuple(object, errors);
468 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000470
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000471 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000472 if (result == NULL) {
473 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000475 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000476 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000477 PyTuple_GET_SIZE(result) != 2) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoder must return a tuple (object,integer)");
480 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000481 }
482 v = PyTuple_GET_ITEM(result,0);
483 Py_INCREF(v);
484 /* We don't check or use the second (integer) entry. */
485
486 Py_DECREF(args);
487 Py_DECREF(decoder);
488 Py_DECREF(result);
489 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000490
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000491 onError:
492 Py_XDECREF(args);
493 Py_XDECREF(decoder);
494 Py_XDECREF(result);
495 return NULL;
496}
497
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000498/* Generic encoding/decoding API */
499PyObject *PyCodec_Encode(PyObject *object,
500 const char *encoding,
501 const char *errors)
502{
503 PyObject *encoder;
504
505 encoder = PyCodec_Encoder(encoding);
506 if (encoder == NULL)
507 return NULL;
508
509 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
510}
511
512PyObject *PyCodec_Decode(PyObject *object,
513 const char *encoding,
514 const char *errors)
515{
516 PyObject *decoder;
517
518 decoder = PyCodec_Decoder(encoding);
519 if (decoder == NULL)
520 return NULL;
521
522 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
523}
524
525/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000526PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
527 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000528{
529 _Py_IDENTIFIER(_is_text_encoding);
530 PyObject *codec;
531 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000532 int is_text_codec;
533
534 codec = _PyCodec_Lookup(encoding);
535 if (codec == NULL)
536 return NULL;
537
538 /* Backwards compatibility: assume any raw tuple describes a text
539 * encoding, and the same for anything lacking the private
540 * attribute.
541 */
542 if (!PyTuple_CheckExact(codec)) {
543 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
544 if (attr == NULL) {
545 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
546 PyErr_Clear();
547 } else {
548 Py_DECREF(codec);
549 return NULL;
550 }
551 } else {
552 is_text_codec = PyObject_IsTrue(attr);
553 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300554 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000555 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300556 if (!is_text_codec)
557 PyErr_Format(PyExc_LookupError,
558 "'%.400s' is not a text encoding; "
559 "use %s to handle arbitrary codecs",
560 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000561 return NULL;
562 }
563 }
564 }
565
Nick Coghlana9b15242014-02-04 22:11:18 +1000566 /* This appears to be a valid text encoding */
567 return codec;
568}
569
570
571static
572PyObject *codec_getitem_checked(const char *encoding,
573 const char *alternate_command,
574 int index)
575{
576 PyObject *codec;
577 PyObject *v;
578
579 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
580 if (codec == NULL)
581 return NULL;
582
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000583 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000584 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000585 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000586 return v;
587}
588
589static PyObject * _PyCodec_TextEncoder(const char *encoding)
590{
Nick Coghlana9b15242014-02-04 22:11:18 +1000591 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000592}
593
594static PyObject * _PyCodec_TextDecoder(const char *encoding)
595{
Nick Coghlana9b15242014-02-04 22:11:18 +1000596 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000597}
598
599PyObject *_PyCodec_EncodeText(PyObject *object,
600 const char *encoding,
601 const char *errors)
602{
603 PyObject *encoder;
604
605 encoder = _PyCodec_TextEncoder(encoding);
606 if (encoder == NULL)
607 return NULL;
608
609 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
610}
611
612PyObject *_PyCodec_DecodeText(PyObject *object,
613 const char *encoding,
614 const char *errors)
615{
616 PyObject *decoder;
617
618 decoder = _PyCodec_TextDecoder(encoding);
619 if (decoder == NULL)
620 return NULL;
621
622 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
623}
624
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625/* Register the error handling callback function error under the name
626 name. This function will be called by the codec when it encounters
627 an unencodable characters/undecodable bytes and doesn't know the
628 callback name, when name is specified as the error parameter
629 in the call to the encode/decode function.
630 Return 0 on success, -1 on error */
631int PyCodec_RegisterError(const char *name, PyObject *error)
632{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000633 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000634 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000636 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000637 PyErr_SetString(PyExc_TypeError, "handler must be callable");
638 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000639 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000640 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300641 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000642}
643
644/* Lookup the error handling callback function registered under the
645 name error. As a special case NULL can be passed, in which case
646 the error handling callback for strict encoding will be returned. */
647PyObject *PyCodec_LookupError(const char *name)
648{
649 PyObject *handler = NULL;
650
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000651 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000652 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000654
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000655 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000656 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300657 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000658 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000660 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000661 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662 return handler;
663}
664
665static void wrong_exception_type(PyObject *exc)
666{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300667 PyErr_Format(PyExc_TypeError,
668 "don't know how to handle %.200s in error callback",
669 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670}
671
672PyObject *PyCodec_StrictErrors(PyObject *exc)
673{
Brett Cannonbf364092006-03-01 04:25:17 +0000674 if (PyExceptionInstance_Check(exc))
675 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000677 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000678 return NULL;
679}
680
681
682PyObject *PyCodec_IgnoreErrors(PyObject *exc)
683{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000684 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300685
686 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 if (PyUnicodeEncodeError_GetEnd(exc, &end))
688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000689 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300690 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 if (PyUnicodeDecodeError_GetEnd(exc, &end))
692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300694 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000695 if (PyUnicodeTranslateError_GetEnd(exc, &end))
696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697 }
698 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699 wrong_exception_type(exc);
700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701 }
Victor Stinneree450092011-12-01 02:52:11 +0100702 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000703}
704
705
706PyObject *PyCodec_ReplaceErrors(PyObject *exc)
707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000709
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300710 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 int kind;
713 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 if (PyUnicodeEncodeError_GetStart(exc, &start))
715 return NULL;
716 if (PyUnicodeEncodeError_GetEnd(exc, &end))
717 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718 len = end - start;
719 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 if (res == NULL)
721 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722 kind = PyUnicode_KIND(res);
723 data = PyUnicode_DATA(res);
724 for (i = 0; i < len; ++i)
725 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200726 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000728 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300729 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000730 if (PyUnicodeDecodeError_GetEnd(exc, &end))
731 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200732 return Py_BuildValue("(Cn)",
733 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
734 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000735 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300736 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738 int kind;
739 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (PyUnicodeTranslateError_GetStart(exc, &start))
741 return NULL;
742 if (PyUnicodeTranslateError_GetEnd(exc, &end))
743 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 len = end - start;
745 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 if (res == NULL)
747 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748 kind = PyUnicode_KIND(res);
749 data = PyUnicode_DATA(res);
750 for (i=0; i < len; i++)
751 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200752 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200753 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000754 }
755 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 wrong_exception_type(exc);
757 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000758 }
759}
760
761PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
762{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300763 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 PyObject *restuple;
765 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100766 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000767 Py_ssize_t start;
768 Py_ssize_t end;
769 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100770 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300771 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100772 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 if (PyUnicodeEncodeError_GetStart(exc, &start))
774 return NULL;
775 if (PyUnicodeEncodeError_GetEnd(exc, &end))
776 return NULL;
777 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
778 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300779 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
780 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100781 for (i = start, ressize = 0; i < end; ++i) {
782 /* object is guaranteed to be "ready" */
783 ch = PyUnicode_READ_CHAR(object, i);
784 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100794 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 ressize += 2+6+1;
796 else
797 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 }
799 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100800 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 if (res == NULL) {
802 Py_DECREF(object);
803 return NULL;
804 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100805 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100807 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 int digits;
809 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100810 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 *outp++ = '&';
812 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100813 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 digits = 1;
815 base = 1;
816 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100817 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 digits = 2;
819 base = 10;
820 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 digits = 3;
823 base = 100;
824 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 digits = 4;
827 base = 1000;
828 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100829 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 digits = 5;
831 base = 10000;
832 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100833 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000834 digits = 6;
835 base = 100000;
836 }
837 else {
838 digits = 7;
839 base = 1000000;
840 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100842 *outp++ = '0' + ch/base;
843 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844 base /= 10;
845 }
846 *outp++ = ';';
847 }
Victor Stinner8f825062012-04-27 13:55:39 +0200848 assert(_PyUnicode_CheckConsistency(res, 1));
849 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 Py_DECREF(object);
851 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 }
853 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 wrong_exception_type(exc);
855 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000856 }
857}
858
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000859PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
860{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200861 PyObject *object;
862 Py_ssize_t i;
863 Py_ssize_t start;
864 Py_ssize_t end;
865 PyObject *res;
866 unsigned char *outp;
867 int ressize;
868 Py_UCS4 c;
869
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300870 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300871 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200872 if (PyUnicodeDecodeError_GetStart(exc, &start))
873 return NULL;
874 if (PyUnicodeDecodeError_GetEnd(exc, &end))
875 return NULL;
876 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
877 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300878 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200879 res = PyUnicode_New(4 * (end - start), 127);
880 if (res == NULL) {
881 Py_DECREF(object);
882 return NULL;
883 }
884 outp = PyUnicode_1BYTE_DATA(res);
885 for (i = start; i < end; i++, outp += 4) {
886 unsigned char c = p[i];
887 outp[0] = '\\';
888 outp[1] = 'x';
889 outp[2] = Py_hexdigits[(c>>4)&0xf];
890 outp[3] = Py_hexdigits[c&0xf];
891 }
892
893 assert(_PyUnicode_CheckConsistency(res, 1));
894 Py_DECREF(object);
895 return Py_BuildValue("(Nn)", res, end);
896 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300897 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 if (PyUnicodeEncodeError_GetStart(exc, &start))
899 return NULL;
900 if (PyUnicodeEncodeError_GetEnd(exc, &end))
901 return NULL;
902 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
903 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200904 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300905 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200906 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000907 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200908 if (PyUnicodeTranslateError_GetEnd(exc, &end))
909 return NULL;
910 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000912 }
913 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 wrong_exception_type(exc);
915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000916 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200917
918 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
919 end = start + PY_SSIZE_T_MAX / (1+1+8);
920 for (i = start, ressize = 0; i < end; ++i) {
921 /* object is guaranteed to be "ready" */
922 c = PyUnicode_READ_CHAR(object, i);
923 if (c >= 0x10000) {
924 ressize += 1+1+8;
925 }
926 else if (c >= 0x100) {
927 ressize += 1+1+4;
928 }
929 else
930 ressize += 1+1+2;
931 }
932 res = PyUnicode_New(ressize, 127);
933 if (res == NULL) {
934 Py_DECREF(object);
935 return NULL;
936 }
937 outp = PyUnicode_1BYTE_DATA(res);
938 for (i = start; i < end; ++i) {
939 c = PyUnicode_READ_CHAR(object, i);
940 *outp++ = '\\';
941 if (c >= 0x00010000) {
942 *outp++ = 'U';
943 *outp++ = Py_hexdigits[(c>>28)&0xf];
944 *outp++ = Py_hexdigits[(c>>24)&0xf];
945 *outp++ = Py_hexdigits[(c>>20)&0xf];
946 *outp++ = Py_hexdigits[(c>>16)&0xf];
947 *outp++ = Py_hexdigits[(c>>12)&0xf];
948 *outp++ = Py_hexdigits[(c>>8)&0xf];
949 }
950 else if (c >= 0x100) {
951 *outp++ = 'u';
952 *outp++ = Py_hexdigits[(c>>12)&0xf];
953 *outp++ = Py_hexdigits[(c>>8)&0xf];
954 }
955 else
956 *outp++ = 'x';
957 *outp++ = Py_hexdigits[(c>>4)&0xf];
958 *outp++ = Py_hexdigits[c&0xf];
959 }
960
961 assert(_PyUnicode_CheckConsistency(res, 1));
962 Py_DECREF(object);
963 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000964}
965
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200966static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200967
968PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
969{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300970 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200971 PyObject *restuple;
972 PyObject *object;
973 Py_ssize_t i;
974 Py_ssize_t start;
975 Py_ssize_t end;
976 PyObject *res;
977 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200978 Py_ssize_t ressize;
979 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200980 Py_UCS4 c;
981 char buffer[256]; /* NAME_MAXLEN */
982 if (PyUnicodeEncodeError_GetStart(exc, &start))
983 return NULL;
984 if (PyUnicodeEncodeError_GetEnd(exc, &end))
985 return NULL;
986 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
987 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200988 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200989 /* load the unicode data module */
990 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
991 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200992 if (!ucnhash_CAPI)
993 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200994 }
995 for (i = start, ressize = 0; i < end; ++i) {
996 /* object is guaranteed to be "ready" */
997 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200998 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200999 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001000 }
1001 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001002 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001003 }
1004 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001005 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001006 }
1007 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001008 replsize = 1+1+2;
1009 if (ressize > PY_SSIZE_T_MAX - replsize)
1010 break;
1011 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001012 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001013 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001014 res = PyUnicode_New(ressize, 127);
1015 if (res==NULL)
1016 return NULL;
1017 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1018 i < end; ++i) {
1019 c = PyUnicode_READ_CHAR(object, i);
1020 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001021 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001022 *outp++ = 'N';
1023 *outp++ = '{';
1024 strcpy((char *)outp, buffer);
1025 outp += strlen(buffer);
1026 *outp++ = '}';
1027 continue;
1028 }
1029 if (c >= 0x00010000) {
1030 *outp++ = 'U';
1031 *outp++ = Py_hexdigits[(c>>28)&0xf];
1032 *outp++ = Py_hexdigits[(c>>24)&0xf];
1033 *outp++ = Py_hexdigits[(c>>20)&0xf];
1034 *outp++ = Py_hexdigits[(c>>16)&0xf];
1035 *outp++ = Py_hexdigits[(c>>12)&0xf];
1036 *outp++ = Py_hexdigits[(c>>8)&0xf];
1037 }
1038 else if (c >= 0x100) {
1039 *outp++ = 'u';
1040 *outp++ = Py_hexdigits[(c>>12)&0xf];
1041 *outp++ = Py_hexdigits[(c>>8)&0xf];
1042 }
1043 else
1044 *outp++ = 'x';
1045 *outp++ = Py_hexdigits[(c>>4)&0xf];
1046 *outp++ = Py_hexdigits[c&0xf];
1047 }
1048
Benjamin Peterson3663b582014-11-26 14:39:54 -06001049 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001050 assert(_PyUnicode_CheckConsistency(res, 1));
1051 restuple = Py_BuildValue("(Nn)", res, end);
1052 Py_DECREF(object);
1053 return restuple;
1054 }
1055 else {
1056 wrong_exception_type(exc);
1057 return NULL;
1058 }
1059}
1060
/* Codes returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Parse what follows "utf16"/"utf32" in an encoding name.

   An empty suffix selects the native byte order (per WORDS_BIGENDIAN).
   Otherwise an optional '-' or '_' followed by exactly "be" or "le"
   (case-insensitive) selects `big` or `little`.  Anything else yields
   ENC_UNKNOWN. */
static int
get_byteorder_variant(const char *suffix, int big, int little)
{
    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return big;
#else
        return little;
#endif
    }
    if (*suffix == '-' || *suffix == '_' )
        suffix++;
    /* Check suffix[0] first: when only a separator was given, suffix[1]
       lies beyond the terminating NUL and must not be read. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        if (Py_TOLOWER(suffix[0]) == 'b')
            return big;
        if (Py_TOLOWER(suffix[0]) == 'l')
            return little;
    }
    return ENC_UNKNOWN;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognized spellings are "utf" (case-insensitive), an optional '-' or
   '_', then "8", "16" or "32"; the 16/32 forms may carry a byte-order
   suffix (see get_byteorder_variant()).  The exact name "CP_UTF8" is
   treated as UTF-8.

   On success *bytelength is set to 3, 2 or 4 for UTF-8, UTF-16 and UTF-32
   respectively.  When ENC_UNKNOWN is returned, *bytelength may or may not
   have been written and must not be relied on. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return get_byteorder_variant(encoding + 2,
                                         ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return get_byteorder_variant(encoding + 2,
                                         ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1126
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001127/* This handler is declared static until someone demonstrates
1128 a need to call it directly. */
1129static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001130PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001131{
1132 PyObject *restuple;
1133 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001134 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001135 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001136 int code;
1137 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001138 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001139 Py_ssize_t start;
1140 Py_ssize_t end;
1141 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001142
1143 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001144 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 if (PyUnicodeEncodeError_GetStart(exc, &start))
1146 return NULL;
1147 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1148 return NULL;
1149 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1150 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001151 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1152 Py_DECREF(object);
1153 return NULL;
1154 }
1155 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1156 Py_DECREF(object);
1157 Py_DECREF(encode);
1158 return NULL;
1159 }
1160 code = get_standard_encoding(encoding, &bytelength);
1161 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001162 if (code == ENC_UNKNOWN) {
1163 /* Not supported, fail with original exception */
1164 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165 Py_DECREF(object);
1166 return NULL;
1167 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001168
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001169 if (end - start > PY_SSIZE_T_MAX / bytelength)
1170 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001171 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 if (!res) {
1173 Py_DECREF(object);
1174 return NULL;
1175 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001176 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001177 for (i = start; i < end; i++) {
1178 /* object is guaranteed to be "ready" */
1179 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001180 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 /* Not a surrogate, fail with original exception */
1182 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1183 Py_DECREF(res);
1184 Py_DECREF(object);
1185 return NULL;
1186 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001187 switch (code) {
1188 case ENC_UTF8:
1189 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1190 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1191 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1192 break;
1193 case ENC_UTF16LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 break;
1197 case ENC_UTF16BE:
1198 *outp++ = (unsigned char)(ch >> 8);
1199 *outp++ = (unsigned char) ch;
1200 break;
1201 case ENC_UTF32LE:
1202 *outp++ = (unsigned char) ch;
1203 *outp++ = (unsigned char)(ch >> 8);
1204 *outp++ = (unsigned char)(ch >> 16);
1205 *outp++ = (unsigned char)(ch >> 24);
1206 break;
1207 case ENC_UTF32BE:
1208 *outp++ = (unsigned char)(ch >> 24);
1209 *outp++ = (unsigned char)(ch >> 16);
1210 *outp++ = (unsigned char)(ch >> 8);
1211 *outp++ = (unsigned char) ch;
1212 break;
1213 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 }
1215 restuple = Py_BuildValue("(On)", res, end);
1216 Py_DECREF(res);
1217 Py_DECREF(object);
1218 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001219 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001220 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001221 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001222 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 if (PyUnicodeDecodeError_GetStart(exc, &start))
1224 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001225 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1226 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1228 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001229 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001230 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1231 Py_DECREF(object);
1232 return NULL;
1233 }
1234 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1235 Py_DECREF(object);
1236 Py_DECREF(encode);
1237 return NULL;
1238 }
1239 code = get_standard_encoding(encoding, &bytelength);
1240 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001241 if (code == ENC_UNKNOWN) {
1242 /* Not supported, fail with original exception */
1243 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1244 Py_DECREF(object);
1245 return NULL;
1246 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001247
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 /* Try decoding a single surrogate character. If
1249 there are more, let the codec call us again. */
1250 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001251 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1252 switch (code) {
1253 case ENC_UTF8:
1254 if ((p[0] & 0xf0) == 0xe0 &&
1255 (p[1] & 0xc0) == 0x80 &&
1256 (p[2] & 0xc0) == 0x80) {
1257 /* it's a three-byte code */
1258 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1259 }
1260 break;
1261 case ENC_UTF16LE:
1262 ch = p[1] << 8 | p[0];
1263 break;
1264 case ENC_UTF16BE:
1265 ch = p[0] << 8 | p[1];
1266 break;
1267 case ENC_UTF32LE:
1268 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1269 break;
1270 case ENC_UTF32BE:
1271 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1272 break;
1273 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001275
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001277 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1278 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1280 return NULL;
1281 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001282 res = PyUnicode_FromOrdinal(ch);
1283 if (res == NULL)
1284 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001285 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001286 }
1287 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 wrong_exception_type(exc);
1289 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001290 }
1291}
1292
Martin v. Löwis011e8422009-05-05 04:43:17 +00001293static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001294PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001295{
1296 PyObject *restuple;
1297 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001298 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001299 Py_ssize_t start;
1300 Py_ssize_t end;
1301 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001302
1303 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 char *outp;
1305 if (PyUnicodeEncodeError_GetStart(exc, &start))
1306 return NULL;
1307 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1308 return NULL;
1309 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1310 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 res = PyBytes_FromStringAndSize(NULL, end-start);
1312 if (!res) {
1313 Py_DECREF(object);
1314 return NULL;
1315 }
1316 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001317 for (i = start; i < end; i++) {
1318 /* object is guaranteed to be "ready" */
1319 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 if (ch < 0xdc80 || ch > 0xdcff) {
1321 /* Not a UTF-8b surrogate, fail with original exception */
1322 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1323 Py_DECREF(res);
1324 Py_DECREF(object);
1325 return NULL;
1326 }
1327 *outp++ = ch - 0xdc00;
1328 }
1329 restuple = Py_BuildValue("(On)", res, end);
1330 Py_DECREF(res);
1331 Py_DECREF(object);
1332 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001333 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001334 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001335 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001336 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001337 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 int consumed = 0;
1339 if (PyUnicodeDecodeError_GetStart(exc, &start))
1340 return NULL;
1341 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1342 return NULL;
1343 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1344 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001345 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 while (consumed < 4 && consumed < end-start) {
1347 /* Refuse to escape ASCII bytes. */
1348 if (p[start+consumed] < 128)
1349 break;
1350 ch[consumed] = 0xdc00 + p[start+consumed];
1351 consumed++;
1352 }
1353 Py_DECREF(object);
1354 if (!consumed) {
1355 /* codec complained about ASCII byte. */
1356 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1357 return NULL;
1358 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001359 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1360 if (str == NULL)
1361 return NULL;
1362 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001363 }
1364 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 wrong_exception_type(exc);
1366 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001367 }
1368}
1369
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371static PyObject *strict_errors(PyObject *self, PyObject *exc)
1372{
1373 return PyCodec_StrictErrors(exc);
1374}
1375
1376
1377static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1378{
1379 return PyCodec_IgnoreErrors(exc);
1380}
1381
1382
1383static PyObject *replace_errors(PyObject *self, PyObject *exc)
1384{
1385 return PyCodec_ReplaceErrors(exc);
1386}
1387
1388
1389static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1390{
1391 return PyCodec_XMLCharRefReplaceErrors(exc);
1392}
1393
1394
1395static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1396{
1397 return PyCodec_BackslashReplaceErrors(exc);
1398}
1399
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001400static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1401{
1402 return PyCodec_NameReplaceErrors(exc);
1403}
1404
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001405static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001406{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001407 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001408}
1409
Martin v. Löwis43c57782009-05-10 08:15:24 +00001410static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001411{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001412 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001413}
1414
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001415static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001416{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 char *name;
1419 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 } methods[] =
1421 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 {
1423 "strict",
1424 {
1425 "strict_errors",
1426 strict_errors,
1427 METH_O,
1428 PyDoc_STR("Implements the 'strict' error handling, which "
1429 "raises a UnicodeError on coding errors.")
1430 }
1431 },
1432 {
1433 "ignore",
1434 {
1435 "ignore_errors",
1436 ignore_errors,
1437 METH_O,
1438 PyDoc_STR("Implements the 'ignore' error handling, which "
1439 "ignores malformed data and continues.")
1440 }
1441 },
1442 {
1443 "replace",
1444 {
1445 "replace_errors",
1446 replace_errors,
1447 METH_O,
1448 PyDoc_STR("Implements the 'replace' error handling, which "
1449 "replaces malformed data with a replacement marker.")
1450 }
1451 },
1452 {
1453 "xmlcharrefreplace",
1454 {
1455 "xmlcharrefreplace_errors",
1456 xmlcharrefreplace_errors,
1457 METH_O,
1458 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1459 "which replaces an unencodable character with the "
1460 "appropriate XML character reference.")
1461 }
1462 },
1463 {
1464 "backslashreplace",
1465 {
1466 "backslashreplace_errors",
1467 backslashreplace_errors,
1468 METH_O,
1469 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001470 "which replaces malformed data with a backslashed "
1471 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001472 }
1473 },
1474 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001475 "namereplace",
1476 {
1477 "namereplace_errors",
1478 namereplace_errors,
1479 METH_O,
1480 PyDoc_STR("Implements the 'namereplace' error handling, "
1481 "which replaces an unencodable character with a "
1482 "\\N{...} escape sequence.")
1483 }
1484 },
1485 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001486 "surrogatepass",
1487 {
1488 "surrogatepass",
1489 surrogatepass_errors,
1490 METH_O
1491 }
1492 },
1493 {
1494 "surrogateescape",
1495 {
1496 "surrogateescape",
1497 surrogateescape_errors,
1498 METH_O
1499 }
1500 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001502
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001503 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001504 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001505 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001506
1507 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001508 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001509
1510 interp->codec_search_path = PyList_New(0);
1511 interp->codec_search_cache = PyDict_New();
1512 interp->codec_error_registry = PyDict_New();
1513
1514 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001515 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001516 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001517 int res;
1518 if (!func)
1519 Py_FatalError("can't initialize codec error registry");
1520 res = PyCodec_RegisterError(methods[i].name, func);
1521 Py_DECREF(func);
1522 if (res)
1523 Py_FatalError("can't initialize codec error registry");
1524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001526
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001527 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001528 interp->codec_search_cache == NULL ||
1529 interp->codec_error_registry == NULL)
1530 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001531
Christian Heimes819b8bf2008-01-03 23:05:47 +00001532 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001533 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001535 }
1536 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001537 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001538 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001539}