blob: 223ccca603fcf59013e141424fcf2ac0e7e66d08 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060012#include "internal/pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
Victor Stinnerf5cff562011-10-14 02:13:11 +020016const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
20/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000022
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
Guido van Rossumb95de4f2000-03-31 17:25:23 +000026 ImportErrors are silently ignored by this function. Only one try is
27 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
29*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000035 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Guido van Rossum9e896b32000-04-05 20:11:21 +000052/* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
81 if (v == NULL)
82 return NULL;
83 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000084 return v;
85}
86
87/* Lookup the given encoding and return a tuple providing the codec
88 facilities.
89
90 The encoding string is looked up converted to all lower-case
91 characters. This makes encodings looked up through this mechanism
92 effectively case-insensitive.
93
Guido van Rossum98297ee2007-11-06 21:34:58 +000094 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000095
96 As side effect, this tries to load the encodings package, if not
97 yet done. This is part of the lazy load strategy for the encodings
98 package.
99
100*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000101
102PyObject *_PyCodec_Lookup(const char *encoding)
103{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000105 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000106 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000107
Fred Drake766de832000-05-09 19:55:59 +0000108 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000109 PyErr_BadArgument();
110 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000111 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000113 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000114 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000115 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000116
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000118 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000119 replaced with underscores. */
120 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000121 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000122 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000123 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000124
125 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000126 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000128 Py_INCREF(result);
129 Py_DECREF(v);
130 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000132
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000133 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000134 args = PyTuple_New(1);
135 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000136 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000137 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000139 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000140 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000142 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000143 PyErr_SetString(PyExc_LookupError,
144 "no codec search functions registered: "
145 "can't find encoding");
146 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000147 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148
149 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000151
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 func = PyList_GetItem(interp->codec_search_path, i);
153 if (func == NULL)
154 goto onError;
155 result = PyEval_CallObject(func, args);
156 if (result == NULL)
157 goto onError;
158 if (result == Py_None) {
159 Py_DECREF(result);
160 continue;
161 }
162 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
163 PyErr_SetString(PyExc_TypeError,
164 "codec search functions must return 4-tuples");
165 Py_DECREF(result);
166 goto onError;
167 }
168 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000169 }
170 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 /* XXX Perhaps we should cache misses too ? */
172 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000173 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000175 }
176
177 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000178 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000179 Py_DECREF(result);
180 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000181 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000182 Py_DECREF(args);
183 return result;
184
185 onError:
186 Py_XDECREF(args);
187 return NULL;
188}
189
Nick Coghlan8fad1672014-09-15 23:50:44 +1200190int _PyCodec_Forget(const char *encoding)
191{
192 PyInterpreterState *interp;
193 PyObject *v;
194 int result;
195
196 interp = PyThreadState_GET()->interp;
197 if (interp->codec_search_path == NULL) {
198 return -1;
199 }
200
201 /* Convert the encoding to a normalized Python string: all
202 characters are converted to lower case, spaces and hyphens are
203 replaced with underscores. */
204 v = normalizestring(encoding);
205 if (v == NULL) {
206 return -1;
207 }
208
209 /* Drop the named codec from the internal cache */
210 result = PyDict_DelItem(interp->codec_search_cache, v);
211 Py_DECREF(v);
212
213 return result;
214}
215
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000216/* Codec registry encoding check API. */
217
218int PyCodec_KnownEncoding(const char *encoding)
219{
220 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000221
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000222 codecs = _PyCodec_Lookup(encoding);
223 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 PyErr_Clear();
225 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000226 }
227 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000228 Py_DECREF(codecs);
229 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000230 }
231}
232
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233static
234PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236{
237 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000238
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000239 args = PyTuple_New(1 + (errors != NULL));
240 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000242 Py_INCREF(object);
243 PyTuple_SET_ITEM(args,0,object);
244 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 v = PyUnicode_FromString(errors);
248 if (v == NULL) {
249 Py_DECREF(args);
250 return NULL;
251 }
252 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000253 }
254 return args;
255}
256
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000257/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000258
259static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000261{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 PyObject *codecs;
263 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000264
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000265 codecs = _PyCodec_Lookup(encoding);
266 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000268 v = PyTuple_GET_ITEM(codecs, index);
269 Py_DECREF(codecs);
270 Py_INCREF(v);
271 return v;
272}
273
Nick Coghlana9b15242014-02-04 22:11:18 +1000274/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275static
Nick Coghlana9b15242014-02-04 22:11:18 +1000276PyObject *codec_makeincrementalcodec(PyObject *codec_info,
277 const char *errors,
278 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279{
Nick Coghlana9b15242014-02-04 22:11:18 +1000280 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281
Nick Coghlana9b15242014-02-04 22:11:18 +1000282 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100288 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 Py_DECREF(inccodec);
290 return ret;
291}
292
Nick Coghlana9b15242014-02-04 22:11:18 +1000293static
294PyObject *codec_getincrementalcodec(const char *encoding,
295 const char *errors,
296 const char *attrname)
297{
298 PyObject *codec_info, *ret;
299
300 codec_info = _PyCodec_Lookup(encoding);
301 if (codec_info == NULL)
302 return NULL;
303 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
304 Py_DECREF(codec_info);
305 return ret;
306}
307
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000308/* Helper function to create a stream codec. */
309
310static
311PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000312 PyObject *stream,
313 const char *errors,
314 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000316 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000317
318 codecs = _PyCodec_Lookup(encoding);
319 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000320 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000321
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000322 codeccls = PyTuple_GET_ITEM(codecs, index);
323 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000325 else
Victor Stinner7bfb42d2016-12-05 17:04:32 +0100326 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000327 Py_DECREF(codecs);
328 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000329}
330
Nick Coghlana9b15242014-02-04 22:11:18 +1000331/* Helpers to work with the result of _PyCodec_Lookup
332
333 */
334PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
335 const char *errors)
336{
337 return codec_makeincrementalcodec(codec_info, errors,
338 "incrementaldecoder");
339}
340
341PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
342 const char *errors)
343{
344 return codec_makeincrementalcodec(codec_info, errors,
345 "incrementalencoder");
346}
347
348
Guido van Rossum98297ee2007-11-06 21:34:58 +0000349/* Convenience APIs to query the Codec registry.
350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000352
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000353 */
354
355PyObject *PyCodec_Encoder(const char *encoding)
356{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000357 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000358}
359
360PyObject *PyCodec_Decoder(const char *encoding)
361{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000363}
364
Thomas Woutersa9773292006-04-21 09:43:23 +0000365PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000367{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000369}
370
371PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000373{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000375}
376
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 PyObject *stream,
379 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382}
383
384PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000385 PyObject *stream,
386 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000387{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000389}
390
/* Try to make the reported exception chain name the codec whose
 * invocation failed, without changing the type of the exception raised.
 *
 * operation is a short description such as "encoding" or "decoding";
 * encoding is the codec name inserted into the message.
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* Per the original note here: _PyErr_TrySetFromCause replaces the
     * active exception with a suitably updated clone if it can, and
     * otherwise leaves the original exception alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
406
Martin Panter6245cb32016-04-15 02:14:19 +0000407/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000408 and return the resulting encoded object (usually a Python string).
409
410 errors is passed to the encoder factory as argument if non-NULL. */
411
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000412static PyObject *
413_PyCodec_EncodeInternal(PyObject *object,
414 PyObject *encoder,
415 const char *encoding,
416 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000417{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000418 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000419 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000420
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000421 args = args_tuple(object, errors);
422 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000424
425 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000426 if (result == NULL) {
427 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000428 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000429 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000430
Guido van Rossum98297ee2007-11-06 21:34:58 +0000431 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000432 PyTuple_GET_SIZE(result) != 2) {
433 PyErr_SetString(PyExc_TypeError,
434 "encoder must return a tuple (object, integer)");
435 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000436 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000437 v = PyTuple_GET_ITEM(result,0);
438 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000439 /* We don't check or use the second (integer) entry. */
440
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000441 Py_DECREF(args);
442 Py_DECREF(encoder);
443 Py_DECREF(result);
444 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000445
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000446 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000447 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000448 Py_XDECREF(args);
449 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000450 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000451}
452
453/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000454 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000455
456 errors is passed to the decoder factory as argument if non-NULL. */
457
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000458static PyObject *
459_PyCodec_DecodeInternal(PyObject *object,
460 PyObject *decoder,
461 const char *encoding,
462 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000463{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000464 PyObject *args = NULL, *result = NULL;
465 PyObject *v;
466
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000467 args = args_tuple(object, errors);
468 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000470
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000471 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000472 if (result == NULL) {
473 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000474 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000475 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000476 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000477 PyTuple_GET_SIZE(result) != 2) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoder must return a tuple (object,integer)");
480 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000481 }
482 v = PyTuple_GET_ITEM(result,0);
483 Py_INCREF(v);
484 /* We don't check or use the second (integer) entry. */
485
486 Py_DECREF(args);
487 Py_DECREF(decoder);
488 Py_DECREF(result);
489 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000490
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000491 onError:
492 Py_XDECREF(args);
493 Py_XDECREF(decoder);
494 Py_XDECREF(result);
495 return NULL;
496}
497
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000498/* Generic encoding/decoding API */
499PyObject *PyCodec_Encode(PyObject *object,
500 const char *encoding,
501 const char *errors)
502{
503 PyObject *encoder;
504
505 encoder = PyCodec_Encoder(encoding);
506 if (encoder == NULL)
507 return NULL;
508
509 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
510}
511
512PyObject *PyCodec_Decode(PyObject *object,
513 const char *encoding,
514 const char *errors)
515{
516 PyObject *decoder;
517
518 decoder = PyCodec_Decoder(encoding);
519 if (decoder == NULL)
520 return NULL;
521
522 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
523}
524
525/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000526PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
527 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000528{
529 _Py_IDENTIFIER(_is_text_encoding);
530 PyObject *codec;
531 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000532 int is_text_codec;
533
534 codec = _PyCodec_Lookup(encoding);
535 if (codec == NULL)
536 return NULL;
537
538 /* Backwards compatibility: assume any raw tuple describes a text
539 * encoding, and the same for anything lacking the private
540 * attribute.
541 */
542 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200543 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
544 Py_DECREF(codec);
545 return NULL;
546 }
547 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000548 is_text_codec = PyObject_IsTrue(attr);
549 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300550 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000551 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300552 if (!is_text_codec)
553 PyErr_Format(PyExc_LookupError,
554 "'%.400s' is not a text encoding; "
555 "use %s to handle arbitrary codecs",
556 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000557 return NULL;
558 }
559 }
560 }
561
Nick Coghlana9b15242014-02-04 22:11:18 +1000562 /* This appears to be a valid text encoding */
563 return codec;
564}
565
566
567static
568PyObject *codec_getitem_checked(const char *encoding,
569 const char *alternate_command,
570 int index)
571{
572 PyObject *codec;
573 PyObject *v;
574
575 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
576 if (codec == NULL)
577 return NULL;
578
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000579 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000580 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000581 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582 return v;
583}
584
585static PyObject * _PyCodec_TextEncoder(const char *encoding)
586{
Nick Coghlana9b15242014-02-04 22:11:18 +1000587 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000588}
589
590static PyObject * _PyCodec_TextDecoder(const char *encoding)
591{
Nick Coghlana9b15242014-02-04 22:11:18 +1000592 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000593}
594
595PyObject *_PyCodec_EncodeText(PyObject *object,
596 const char *encoding,
597 const char *errors)
598{
599 PyObject *encoder;
600
601 encoder = _PyCodec_TextEncoder(encoding);
602 if (encoder == NULL)
603 return NULL;
604
605 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
606}
607
608PyObject *_PyCodec_DecodeText(PyObject *object,
609 const char *encoding,
610 const char *errors)
611{
612 PyObject *decoder;
613
614 decoder = _PyCodec_TextDecoder(encoding);
615 if (decoder == NULL)
616 return NULL;
617
618 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
619}
620
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000621/* Register the error handling callback function error under the name
622 name. This function will be called by the codec when it encounters
623 an unencodable characters/undecodable bytes and doesn't know the
624 callback name, when name is specified as the error parameter
625 in the call to the encode/decode function.
626 Return 0 on success, -1 on error */
627int PyCodec_RegisterError(const char *name, PyObject *error)
628{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000629 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000630 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000632 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 PyErr_SetString(PyExc_TypeError, "handler must be callable");
634 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000635 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000636 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300637 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638}
639
640/* Lookup the error handling callback function registered under the
641 name error. As a special case NULL can be passed, in which case
642 the error handling callback for strict encoding will be returned. */
643PyObject *PyCodec_LookupError(const char *name)
644{
645 PyObject *handler = NULL;
646
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000647 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000648 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000650
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000651 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000652 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300653 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000654 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000658 return handler;
659}
660
661static void wrong_exception_type(PyObject *exc)
662{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300663 PyErr_Format(PyExc_TypeError,
664 "don't know how to handle %.200s in error callback",
665 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000666}
667
668PyObject *PyCodec_StrictErrors(PyObject *exc)
669{
Brett Cannonbf364092006-03-01 04:25:17 +0000670 if (PyExceptionInstance_Check(exc))
671 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000672 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000673 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000674 return NULL;
675}
676
677
678PyObject *PyCodec_IgnoreErrors(PyObject *exc)
679{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000680 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300681
682 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 if (PyUnicodeEncodeError_GetEnd(exc, &end))
684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300686 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 if (PyUnicodeDecodeError_GetEnd(exc, &end))
688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000689 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300690 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 if (PyUnicodeTranslateError_GetEnd(exc, &end))
692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 }
694 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000695 wrong_exception_type(exc);
696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697 }
Victor Stinneree450092011-12-01 02:52:11 +0100698 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000699}
700
701
702PyObject *PyCodec_ReplaceErrors(PyObject *exc)
703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000705
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300706 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000707 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 int kind;
709 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 if (PyUnicodeEncodeError_GetStart(exc, &start))
711 return NULL;
712 if (PyUnicodeEncodeError_GetEnd(exc, &end))
713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714 len = end - start;
715 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000716 if (res == NULL)
717 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718 kind = PyUnicode_KIND(res);
719 data = PyUnicode_DATA(res);
720 for (i = 0; i < len; ++i)
721 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200722 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000724 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300725 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000726 if (PyUnicodeDecodeError_GetEnd(exc, &end))
727 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200728 return Py_BuildValue("(Cn)",
729 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
730 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000731 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300732 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000733 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734 int kind;
735 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 if (PyUnicodeTranslateError_GetStart(exc, &start))
737 return NULL;
738 if (PyUnicodeTranslateError_GetEnd(exc, &end))
739 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 len = end - start;
741 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 if (res == NULL)
743 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 kind = PyUnicode_KIND(res);
745 data = PyUnicode_DATA(res);
746 for (i=0; i < len; i++)
747 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200748 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200749 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750 }
751 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 wrong_exception_type(exc);
753 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000754 }
755}
756
757PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
758{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300759 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000760 PyObject *restuple;
761 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100762 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 Py_ssize_t start;
764 Py_ssize_t end;
765 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100766 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300767 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100768 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 if (PyUnicodeEncodeError_GetStart(exc, &start))
770 return NULL;
771 if (PyUnicodeEncodeError_GetEnd(exc, &end))
772 return NULL;
773 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
774 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300775 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
776 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100777 for (i = start, ressize = 0; i < end; ++i) {
778 /* object is guaranteed to be "ready" */
779 ch = PyUnicode_READ_CHAR(object, i);
780 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 ressize += 2+6+1;
792 else
793 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 }
795 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100796 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 if (res == NULL) {
798 Py_DECREF(object);
799 return NULL;
800 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100801 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100803 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 int digits;
805 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100806 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 *outp++ = '&';
808 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100809 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 digits = 1;
811 base = 1;
812 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100813 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 digits = 2;
815 base = 10;
816 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100817 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 digits = 3;
819 base = 100;
820 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 digits = 4;
823 base = 1000;
824 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 digits = 5;
827 base = 10000;
828 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100829 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 digits = 6;
831 base = 100000;
832 }
833 else {
834 digits = 7;
835 base = 1000000;
836 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000837 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100838 *outp++ = '0' + ch/base;
839 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000840 base /= 10;
841 }
842 *outp++ = ';';
843 }
Victor Stinner8f825062012-04-27 13:55:39 +0200844 assert(_PyUnicode_CheckConsistency(res, 1));
845 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 Py_DECREF(object);
847 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848 }
849 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 wrong_exception_type(exc);
851 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 }
853}
854
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000855PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
856{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200857 PyObject *object;
858 Py_ssize_t i;
859 Py_ssize_t start;
860 Py_ssize_t end;
861 PyObject *res;
862 unsigned char *outp;
863 int ressize;
864 Py_UCS4 c;
865
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300866 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300867 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200868 if (PyUnicodeDecodeError_GetStart(exc, &start))
869 return NULL;
870 if (PyUnicodeDecodeError_GetEnd(exc, &end))
871 return NULL;
872 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
873 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300874 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200875 res = PyUnicode_New(4 * (end - start), 127);
876 if (res == NULL) {
877 Py_DECREF(object);
878 return NULL;
879 }
880 outp = PyUnicode_1BYTE_DATA(res);
881 for (i = start; i < end; i++, outp += 4) {
882 unsigned char c = p[i];
883 outp[0] = '\\';
884 outp[1] = 'x';
885 outp[2] = Py_hexdigits[(c>>4)&0xf];
886 outp[3] = Py_hexdigits[c&0xf];
887 }
888
889 assert(_PyUnicode_CheckConsistency(res, 1));
890 Py_DECREF(object);
891 return Py_BuildValue("(Nn)", res, end);
892 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300893 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000894 if (PyUnicodeEncodeError_GetStart(exc, &start))
895 return NULL;
896 if (PyUnicodeEncodeError_GetEnd(exc, &end))
897 return NULL;
898 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
899 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200900 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300901 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200902 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000903 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200904 if (PyUnicodeTranslateError_GetEnd(exc, &end))
905 return NULL;
906 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000908 }
909 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000910 wrong_exception_type(exc);
911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000912 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200913
914 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
915 end = start + PY_SSIZE_T_MAX / (1+1+8);
916 for (i = start, ressize = 0; i < end; ++i) {
917 /* object is guaranteed to be "ready" */
918 c = PyUnicode_READ_CHAR(object, i);
919 if (c >= 0x10000) {
920 ressize += 1+1+8;
921 }
922 else if (c >= 0x100) {
923 ressize += 1+1+4;
924 }
925 else
926 ressize += 1+1+2;
927 }
928 res = PyUnicode_New(ressize, 127);
929 if (res == NULL) {
930 Py_DECREF(object);
931 return NULL;
932 }
933 outp = PyUnicode_1BYTE_DATA(res);
934 for (i = start; i < end; ++i) {
935 c = PyUnicode_READ_CHAR(object, i);
936 *outp++ = '\\';
937 if (c >= 0x00010000) {
938 *outp++ = 'U';
939 *outp++ = Py_hexdigits[(c>>28)&0xf];
940 *outp++ = Py_hexdigits[(c>>24)&0xf];
941 *outp++ = Py_hexdigits[(c>>20)&0xf];
942 *outp++ = Py_hexdigits[(c>>16)&0xf];
943 *outp++ = Py_hexdigits[(c>>12)&0xf];
944 *outp++ = Py_hexdigits[(c>>8)&0xf];
945 }
946 else if (c >= 0x100) {
947 *outp++ = 'u';
948 *outp++ = Py_hexdigits[(c>>12)&0xf];
949 *outp++ = Py_hexdigits[(c>>8)&0xf];
950 }
951 else
952 *outp++ = 'x';
953 *outp++ = Py_hexdigits[(c>>4)&0xf];
954 *outp++ = Py_hexdigits[c&0xf];
955 }
956
957 assert(_PyUnicode_CheckConsistency(res, 1));
958 Py_DECREF(object);
959 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960}
961
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200962static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200963
964PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
965{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300966 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200967 PyObject *restuple;
968 PyObject *object;
969 Py_ssize_t i;
970 Py_ssize_t start;
971 Py_ssize_t end;
972 PyObject *res;
973 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200974 Py_ssize_t ressize;
975 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200976 Py_UCS4 c;
977 char buffer[256]; /* NAME_MAXLEN */
978 if (PyUnicodeEncodeError_GetStart(exc, &start))
979 return NULL;
980 if (PyUnicodeEncodeError_GetEnd(exc, &end))
981 return NULL;
982 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
983 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200984 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200985 /* load the unicode data module */
986 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
987 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200988 if (!ucnhash_CAPI)
989 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200990 }
991 for (i = start, ressize = 0; i < end; ++i) {
992 /* object is guaranteed to be "ready" */
993 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200994 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200995 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200996 }
997 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200998 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200999 }
1000 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001001 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001002 }
1003 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001004 replsize = 1+1+2;
1005 if (ressize > PY_SSIZE_T_MAX - replsize)
1006 break;
1007 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001008 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001009 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001010 res = PyUnicode_New(ressize, 127);
1011 if (res==NULL)
1012 return NULL;
1013 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1014 i < end; ++i) {
1015 c = PyUnicode_READ_CHAR(object, i);
1016 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001017 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001018 *outp++ = 'N';
1019 *outp++ = '{';
1020 strcpy((char *)outp, buffer);
1021 outp += strlen(buffer);
1022 *outp++ = '}';
1023 continue;
1024 }
1025 if (c >= 0x00010000) {
1026 *outp++ = 'U';
1027 *outp++ = Py_hexdigits[(c>>28)&0xf];
1028 *outp++ = Py_hexdigits[(c>>24)&0xf];
1029 *outp++ = Py_hexdigits[(c>>20)&0xf];
1030 *outp++ = Py_hexdigits[(c>>16)&0xf];
1031 *outp++ = Py_hexdigits[(c>>12)&0xf];
1032 *outp++ = Py_hexdigits[(c>>8)&0xf];
1033 }
1034 else if (c >= 0x100) {
1035 *outp++ = 'u';
1036 *outp++ = Py_hexdigits[(c>>12)&0xf];
1037 *outp++ = Py_hexdigits[(c>>8)&0xf];
1038 }
1039 else
1040 *outp++ = 'x';
1041 *outp++ = Py_hexdigits[(c>>4)&0xf];
1042 *outp++ = Py_hexdigits[c&0xf];
1043 }
1044
Benjamin Peterson3663b582014-11-26 14:39:54 -06001045 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001046 assert(_PyUnicode_CheckConsistency(res, 1));
1047 restuple = Py_BuildValue("(Nn)", res, end);
1048 Py_DECREF(object);
1049 return restuple;
1050 }
1051 else {
1052 wrong_exception_type(exc);
1053 return NULL;
1054 }
1055}
1056
#define ENC_UNKNOWN     (-1)
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map a codec name onto one of the ENC_* constants.

   Recognised names are "utf" (any letter case), an optional '-' or '_',
   then "8", "16" or "32"; for 16 and 32 an optional '-' or '_' followed by
   "be" or "le" (any letter case) may follow, and without it the native
   byte order (WORDS_BIGENDIAN) is chosen.  The exact name "CP_UTF8" is
   also accepted.

   *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for UTF-32.  It may
   also have been written when ENC_UNKNOWN is returned, so callers must
   check the return value before using it. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: after skipping a trailing separator
               it may be the terminating NUL, and encoding[1] would then lie
               past the end of the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same ordering as the UTF-16 case, for the same reason. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1122
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001123/* This handler is declared static until someone demonstrates
1124 a need to call it directly. */
1125static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001126PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001127{
1128 PyObject *restuple;
1129 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001130 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001131 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001132 int code;
1133 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001134 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001135 Py_ssize_t start;
1136 Py_ssize_t end;
1137 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001138
1139 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001140 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001141 if (PyUnicodeEncodeError_GetStart(exc, &start))
1142 return NULL;
1143 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1144 return NULL;
1145 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1146 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001147 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1148 Py_DECREF(object);
1149 return NULL;
1150 }
1151 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1152 Py_DECREF(object);
1153 Py_DECREF(encode);
1154 return NULL;
1155 }
1156 code = get_standard_encoding(encoding, &bytelength);
1157 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001158 if (code == ENC_UNKNOWN) {
1159 /* Not supported, fail with original exception */
1160 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1161 Py_DECREF(object);
1162 return NULL;
1163 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001164
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001165 if (end - start > PY_SSIZE_T_MAX / bytelength)
1166 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001167 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001168 if (!res) {
1169 Py_DECREF(object);
1170 return NULL;
1171 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001172 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001173 for (i = start; i < end; i++) {
1174 /* object is guaranteed to be "ready" */
1175 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001176 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001177 /* Not a surrogate, fail with original exception */
1178 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1179 Py_DECREF(res);
1180 Py_DECREF(object);
1181 return NULL;
1182 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001183 switch (code) {
1184 case ENC_UTF8:
1185 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1186 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1187 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1188 break;
1189 case ENC_UTF16LE:
1190 *outp++ = (unsigned char) ch;
1191 *outp++ = (unsigned char)(ch >> 8);
1192 break;
1193 case ENC_UTF16BE:
1194 *outp++ = (unsigned char)(ch >> 8);
1195 *outp++ = (unsigned char) ch;
1196 break;
1197 case ENC_UTF32LE:
1198 *outp++ = (unsigned char) ch;
1199 *outp++ = (unsigned char)(ch >> 8);
1200 *outp++ = (unsigned char)(ch >> 16);
1201 *outp++ = (unsigned char)(ch >> 24);
1202 break;
1203 case ENC_UTF32BE:
1204 *outp++ = (unsigned char)(ch >> 24);
1205 *outp++ = (unsigned char)(ch >> 16);
1206 *outp++ = (unsigned char)(ch >> 8);
1207 *outp++ = (unsigned char) ch;
1208 break;
1209 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001210 }
1211 restuple = Py_BuildValue("(On)", res, end);
1212 Py_DECREF(res);
1213 Py_DECREF(object);
1214 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001215 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001216 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001217 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001218 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 if (PyUnicodeDecodeError_GetStart(exc, &start))
1220 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001221 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1222 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1224 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001225 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001226 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1227 Py_DECREF(object);
1228 return NULL;
1229 }
1230 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1231 Py_DECREF(object);
1232 Py_DECREF(encode);
1233 return NULL;
1234 }
1235 code = get_standard_encoding(encoding, &bytelength);
1236 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001237 if (code == ENC_UNKNOWN) {
1238 /* Not supported, fail with original exception */
1239 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1240 Py_DECREF(object);
1241 return NULL;
1242 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001243
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 /* Try decoding a single surrogate character. If
1245 there are more, let the codec call us again. */
1246 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001247 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1248 switch (code) {
1249 case ENC_UTF8:
1250 if ((p[0] & 0xf0) == 0xe0 &&
1251 (p[1] & 0xc0) == 0x80 &&
1252 (p[2] & 0xc0) == 0x80) {
1253 /* it's a three-byte code */
1254 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1255 }
1256 break;
1257 case ENC_UTF16LE:
1258 ch = p[1] << 8 | p[0];
1259 break;
1260 case ENC_UTF16BE:
1261 ch = p[0] << 8 | p[1];
1262 break;
1263 case ENC_UTF32LE:
1264 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1265 break;
1266 case ENC_UTF32BE:
1267 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1268 break;
1269 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001271
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001273 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1274 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1276 return NULL;
1277 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001278 res = PyUnicode_FromOrdinal(ch);
1279 if (res == NULL)
1280 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001281 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001282 }
1283 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001284 wrong_exception_type(exc);
1285 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001286 }
1287}
1288
Martin v. Löwis011e8422009-05-05 04:43:17 +00001289static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001290PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001291{
1292 PyObject *restuple;
1293 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001294 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001295 Py_ssize_t start;
1296 Py_ssize_t end;
1297 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001298
1299 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001300 char *outp;
1301 if (PyUnicodeEncodeError_GetStart(exc, &start))
1302 return NULL;
1303 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1304 return NULL;
1305 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1306 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001307 res = PyBytes_FromStringAndSize(NULL, end-start);
1308 if (!res) {
1309 Py_DECREF(object);
1310 return NULL;
1311 }
1312 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001313 for (i = start; i < end; i++) {
1314 /* object is guaranteed to be "ready" */
1315 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001316 if (ch < 0xdc80 || ch > 0xdcff) {
1317 /* Not a UTF-8b surrogate, fail with original exception */
1318 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1319 Py_DECREF(res);
1320 Py_DECREF(object);
1321 return NULL;
1322 }
1323 *outp++ = ch - 0xdc00;
1324 }
1325 restuple = Py_BuildValue("(On)", res, end);
1326 Py_DECREF(res);
1327 Py_DECREF(object);
1328 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001329 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001330 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001331 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001332 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001333 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001334 int consumed = 0;
1335 if (PyUnicodeDecodeError_GetStart(exc, &start))
1336 return NULL;
1337 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1338 return NULL;
1339 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1340 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001341 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 while (consumed < 4 && consumed < end-start) {
1343 /* Refuse to escape ASCII bytes. */
1344 if (p[start+consumed] < 128)
1345 break;
1346 ch[consumed] = 0xdc00 + p[start+consumed];
1347 consumed++;
1348 }
1349 Py_DECREF(object);
1350 if (!consumed) {
1351 /* codec complained about ASCII byte. */
1352 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1353 return NULL;
1354 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001355 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1356 if (str == NULL)
1357 return NULL;
1358 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001359 }
1360 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001361 wrong_exception_type(exc);
1362 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001363 }
1364}
1365
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001366
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001367static PyObject *strict_errors(PyObject *self, PyObject *exc)
1368{
1369 return PyCodec_StrictErrors(exc);
1370}
1371
1372
1373static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1374{
1375 return PyCodec_IgnoreErrors(exc);
1376}
1377
1378
1379static PyObject *replace_errors(PyObject *self, PyObject *exc)
1380{
1381 return PyCodec_ReplaceErrors(exc);
1382}
1383
1384
1385static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1386{
1387 return PyCodec_XMLCharRefReplaceErrors(exc);
1388}
1389
1390
1391static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1392{
1393 return PyCodec_BackslashReplaceErrors(exc);
1394}
1395
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001396static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1397{
1398 return PyCodec_NameReplaceErrors(exc);
1399}
1400
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001401static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001402{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001403 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001404}
1405
Martin v. Löwis43c57782009-05-10 08:15:24 +00001406static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001407{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001408 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001409}
1410
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001411static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001412{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001413 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 char *name;
1415 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 } methods[] =
1417 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 {
1419 "strict",
1420 {
1421 "strict_errors",
1422 strict_errors,
1423 METH_O,
1424 PyDoc_STR("Implements the 'strict' error handling, which "
1425 "raises a UnicodeError on coding errors.")
1426 }
1427 },
1428 {
1429 "ignore",
1430 {
1431 "ignore_errors",
1432 ignore_errors,
1433 METH_O,
1434 PyDoc_STR("Implements the 'ignore' error handling, which "
1435 "ignores malformed data and continues.")
1436 }
1437 },
1438 {
1439 "replace",
1440 {
1441 "replace_errors",
1442 replace_errors,
1443 METH_O,
1444 PyDoc_STR("Implements the 'replace' error handling, which "
1445 "replaces malformed data with a replacement marker.")
1446 }
1447 },
1448 {
1449 "xmlcharrefreplace",
1450 {
1451 "xmlcharrefreplace_errors",
1452 xmlcharrefreplace_errors,
1453 METH_O,
1454 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1455 "which replaces an unencodable character with the "
1456 "appropriate XML character reference.")
1457 }
1458 },
1459 {
1460 "backslashreplace",
1461 {
1462 "backslashreplace_errors",
1463 backslashreplace_errors,
1464 METH_O,
1465 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001466 "which replaces malformed data with a backslashed "
1467 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001468 }
1469 },
1470 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001471 "namereplace",
1472 {
1473 "namereplace_errors",
1474 namereplace_errors,
1475 METH_O,
1476 PyDoc_STR("Implements the 'namereplace' error handling, "
1477 "which replaces an unencodable character with a "
1478 "\\N{...} escape sequence.")
1479 }
1480 },
1481 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001482 "surrogatepass",
1483 {
1484 "surrogatepass",
1485 surrogatepass_errors,
1486 METH_O
1487 }
1488 },
1489 {
1490 "surrogateescape",
1491 {
1492 "surrogateescape",
1493 surrogateescape_errors,
1494 METH_O
1495 }
1496 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001497 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001499 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001500 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001501 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001502
1503 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001504 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001505
1506 interp->codec_search_path = PyList_New(0);
1507 interp->codec_search_cache = PyDict_New();
1508 interp->codec_error_registry = PyDict_New();
1509
1510 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001511 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001512 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001513 int res;
1514 if (!func)
1515 Py_FatalError("can't initialize codec error registry");
1516 res = PyCodec_RegisterError(methods[i].name, func);
1517 Py_DECREF(func);
1518 if (res)
1519 Py_FatalError("can't initialize codec error registry");
1520 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001521 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001522
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001523 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001524 interp->codec_search_cache == NULL ||
1525 interp->codec_error_registry == NULL)
1526 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001527
Christian Heimes819b8bf2008-01-03 23:05:47 +00001528 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001529 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001530 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001531 }
1532 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001533 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001534 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001535}