blob: d4b34f8397f05a09fc3c158e689b2225a42ea434 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010012#include "pycore_pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
/* The sixteen lowercase hexadecimal digits, in ascending order. */
const char *Py_hexdigits = "0123456789" "abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
20/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000022
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
Guido van Rossumb95de4f2000-03-31 17:25:23 +000026 ImportErrors are silently ignored by this function. Only one try is
27 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
29*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Victor Stinnercaba55b2018-08-03 15:33:52 +020035 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Guido van Rossum9e896b32000-04-05 20:11:21 +000052/* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
Guido van Rossum21431e82007-10-19 21:48:41 +000081 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000102 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000103 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000104
Fred Drake766de832000-05-09 19:55:59 +0000105 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000106 PyErr_BadArgument();
107 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000108 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000109
Victor Stinnercaba55b2018-08-03 15:33:52 +0200110 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000111 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000112 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113
Guido van Rossum9e896b32000-04-05 20:11:21 +0000114 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000115 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000116 replaced with underscores. */
117 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000118 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000119 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000120 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000121
122 /* First, try to lookup the name in the registry dictionary */
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200123 result = PyDict_GetItemWithError(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000124 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000125 Py_INCREF(result);
126 Py_DECREF(v);
127 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000128 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200129 else if (PyErr_Occurred()) {
130 Py_DECREF(v);
131 return NULL;
132 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000133
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000134 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 args = PyTuple_New(1);
Serhiy Storchaka8905fcc2018-12-11 08:38:03 +0200136 if (args == NULL) {
137 Py_DECREF(v);
138 return NULL;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000141
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000142 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000143 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 PyErr_SetString(PyExc_LookupError,
147 "no codec search functions registered: "
148 "can't find encoding");
149 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000150 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000151
152 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000153 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000154
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000155 func = PyList_GetItem(interp->codec_search_path, i);
156 if (func == NULL)
157 goto onError;
158 result = PyEval_CallObject(func, args);
159 if (result == NULL)
160 goto onError;
161 if (result == Py_None) {
162 Py_DECREF(result);
163 continue;
164 }
165 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
166 PyErr_SetString(PyExc_TypeError,
167 "codec search functions must return 4-tuples");
168 Py_DECREF(result);
169 goto onError;
170 }
171 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000172 }
173 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 /* XXX Perhaps we should cache misses too ? */
175 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000176 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 }
179
180 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000181 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000182 Py_DECREF(result);
183 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000184 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000185 Py_DECREF(args);
186 return result;
187
188 onError:
189 Py_XDECREF(args);
190 return NULL;
191}
192
Nick Coghlan8fad1672014-09-15 23:50:44 +1200193int _PyCodec_Forget(const char *encoding)
194{
Nick Coghlan8fad1672014-09-15 23:50:44 +1200195 PyObject *v;
196 int result;
197
Victor Stinnercaba55b2018-08-03 15:33:52 +0200198 PyInterpreterState *interp = _PyInterpreterState_Get();
Nick Coghlan8fad1672014-09-15 23:50:44 +1200199 if (interp->codec_search_path == NULL) {
200 return -1;
201 }
202
203 /* Convert the encoding to a normalized Python string: all
204 characters are converted to lower case, spaces and hyphens are
205 replaced with underscores. */
206 v = normalizestring(encoding);
207 if (v == NULL) {
208 return -1;
209 }
210
211 /* Drop the named codec from the internal cache */
212 result = PyDict_DelItem(interp->codec_search_cache, v);
213 Py_DECREF(v);
214
215 return result;
216}
217
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000218/* Codec registry encoding check API. */
219
220int PyCodec_KnownEncoding(const char *encoding)
221{
222 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000224 codecs = _PyCodec_Lookup(encoding);
225 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 PyErr_Clear();
227 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000228 }
229 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000230 Py_DECREF(codecs);
231 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000232 }
233}
234
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000235static
236PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000237 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000238{
239 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000240
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000241 args = PyTuple_New(1 + (errors != NULL));
242 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000244 Py_INCREF(object);
245 PyTuple_SET_ITEM(args,0,object);
246 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000247 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000248
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000249 v = PyUnicode_FromString(errors);
250 if (v == NULL) {
251 Py_DECREF(args);
252 return NULL;
253 }
254 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000255 }
256 return args;
257}
258
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000260
261static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000263{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 PyObject *codecs;
265 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000266
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000267 codecs = _PyCodec_Lookup(encoding);
268 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000269 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000270 v = PyTuple_GET_ITEM(codecs, index);
271 Py_DECREF(codecs);
272 Py_INCREF(v);
273 return v;
274}
275
Nick Coghlana9b15242014-02-04 22:11:18 +1000276/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277static
Nick Coghlana9b15242014-02-04 22:11:18 +1000278PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279 const char *errors,
280 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281{
Nick Coghlana9b15242014-02-04 22:11:18 +1000282 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283
Nick Coghlana9b15242014-02-04 22:11:18 +1000284 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000288 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100290 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000291 Py_DECREF(inccodec);
292 return ret;
293}
294
Nick Coghlana9b15242014-02-04 22:11:18 +1000295static
296PyObject *codec_getincrementalcodec(const char *encoding,
297 const char *errors,
298 const char *attrname)
299{
300 PyObject *codec_info, *ret;
301
302 codec_info = _PyCodec_Lookup(encoding);
303 if (codec_info == NULL)
304 return NULL;
305 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306 Py_DECREF(codec_info);
307 return ret;
308}
309
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000310/* Helper function to create a stream codec. */
311
312static
313PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000314 PyObject *stream,
315 const char *errors,
316 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000317{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000318 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000319
320 codecs = _PyCodec_Lookup(encoding);
321 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000323
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000324 codeccls = PyTuple_GET_ITEM(codecs, index);
325 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000326 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000327 else
Victor Stinner7bfb42d2016-12-05 17:04:32 +0100328 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000329 Py_DECREF(codecs);
330 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000331}
332
Nick Coghlana9b15242014-02-04 22:11:18 +1000333/* Helpers to work with the result of _PyCodec_Lookup
334
335 */
336PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
337 const char *errors)
338{
339 return codec_makeincrementalcodec(codec_info, errors,
340 "incrementaldecoder");
341}
342
343PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
344 const char *errors)
345{
346 return codec_makeincrementalcodec(codec_info, errors,
347 "incrementalencoder");
348}
349
350
Guido van Rossum98297ee2007-11-06 21:34:58 +0000351/* Convenience APIs to query the Codec registry.
352
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000353 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000354
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000355 */
356
357PyObject *PyCodec_Encoder(const char *encoding)
358{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000359 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000360}
361
362PyObject *PyCodec_Decoder(const char *encoding)
363{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000365}
366
Thomas Woutersa9773292006-04-21 09:43:23 +0000367PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000368 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000369{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000371}
372
373PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000374 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000375{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000377}
378
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000379PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000380 PyObject *stream,
381 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000384}
385
386PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000387 PyObject *stream,
388 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000389{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000390 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000391}
392
/* Helper that tries to make the reported exception chain name the codec
 * operation that failed, without changing the type of the exception
 * raised.
 *
 * operation and encoding are formatted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a suitably
     * updated clone if it can; otherwise it leaves the original exception
     * alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
408
Martin Panter6245cb32016-04-15 02:14:19 +0000409/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000410 and return the resulting encoded object (usually a Python string).
411
412 errors is passed to the encoder factory as argument if non-NULL. */
413
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000414static PyObject *
415_PyCodec_EncodeInternal(PyObject *object,
416 PyObject *encoder,
417 const char *encoding,
418 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000420 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000421 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000422
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000423 args = args_tuple(object, errors);
424 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000425 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000426
427 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000428 if (result == NULL) {
429 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000430 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000431 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000432
Guido van Rossum98297ee2007-11-06 21:34:58 +0000433 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000434 PyTuple_GET_SIZE(result) != 2) {
435 PyErr_SetString(PyExc_TypeError,
436 "encoder must return a tuple (object, integer)");
437 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000438 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000439 v = PyTuple_GET_ITEM(result,0);
440 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000441 /* We don't check or use the second (integer) entry. */
442
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000443 Py_DECREF(args);
444 Py_DECREF(encoder);
445 Py_DECREF(result);
446 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000447
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000448 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000449 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000450 Py_XDECREF(args);
451 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000452 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000453}
454
455/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000456 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000457
458 errors is passed to the decoder factory as argument if non-NULL. */
459
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000460static PyObject *
461_PyCodec_DecodeInternal(PyObject *object,
462 PyObject *decoder,
463 const char *encoding,
464 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000465{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000466 PyObject *args = NULL, *result = NULL;
467 PyObject *v;
468
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000469 args = args_tuple(object, errors);
470 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000471 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000472
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000473 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000474 if (result == NULL) {
475 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000477 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000478 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000479 PyTuple_GET_SIZE(result) != 2) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoder must return a tuple (object,integer)");
482 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000483 }
484 v = PyTuple_GET_ITEM(result,0);
485 Py_INCREF(v);
486 /* We don't check or use the second (integer) entry. */
487
488 Py_DECREF(args);
489 Py_DECREF(decoder);
490 Py_DECREF(result);
491 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000492
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000493 onError:
494 Py_XDECREF(args);
495 Py_XDECREF(decoder);
496 Py_XDECREF(result);
497 return NULL;
498}
499
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000500/* Generic encoding/decoding API */
501PyObject *PyCodec_Encode(PyObject *object,
502 const char *encoding,
503 const char *errors)
504{
505 PyObject *encoder;
506
507 encoder = PyCodec_Encoder(encoding);
508 if (encoder == NULL)
509 return NULL;
510
511 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
512}
513
514PyObject *PyCodec_Decode(PyObject *object,
515 const char *encoding,
516 const char *errors)
517{
518 PyObject *decoder;
519
520 decoder = PyCodec_Decoder(encoding);
521 if (decoder == NULL)
522 return NULL;
523
524 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
525}
526
527/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000528PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
529 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000530{
531 _Py_IDENTIFIER(_is_text_encoding);
532 PyObject *codec;
533 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000534 int is_text_codec;
535
536 codec = _PyCodec_Lookup(encoding);
537 if (codec == NULL)
538 return NULL;
539
540 /* Backwards compatibility: assume any raw tuple describes a text
541 * encoding, and the same for anything lacking the private
542 * attribute.
543 */
544 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200545 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
546 Py_DECREF(codec);
547 return NULL;
548 }
549 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000550 is_text_codec = PyObject_IsTrue(attr);
551 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300552 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000553 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300554 if (!is_text_codec)
555 PyErr_Format(PyExc_LookupError,
556 "'%.400s' is not a text encoding; "
557 "use %s to handle arbitrary codecs",
558 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000559 return NULL;
560 }
561 }
562 }
563
Nick Coghlana9b15242014-02-04 22:11:18 +1000564 /* This appears to be a valid text encoding */
565 return codec;
566}
567
568
569static
570PyObject *codec_getitem_checked(const char *encoding,
571 const char *alternate_command,
572 int index)
573{
574 PyObject *codec;
575 PyObject *v;
576
577 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578 if (codec == NULL)
579 return NULL;
580
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000581 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000583 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000584 return v;
585}
586
587static PyObject * _PyCodec_TextEncoder(const char *encoding)
588{
Nick Coghlana9b15242014-02-04 22:11:18 +1000589 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000590}
591
592static PyObject * _PyCodec_TextDecoder(const char *encoding)
593{
Nick Coghlana9b15242014-02-04 22:11:18 +1000594 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000595}
596
597PyObject *_PyCodec_EncodeText(PyObject *object,
598 const char *encoding,
599 const char *errors)
600{
601 PyObject *encoder;
602
603 encoder = _PyCodec_TextEncoder(encoding);
604 if (encoder == NULL)
605 return NULL;
606
607 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
608}
609
610PyObject *_PyCodec_DecodeText(PyObject *object,
611 const char *encoding,
612 const char *errors)
613{
614 PyObject *decoder;
615
616 decoder = _PyCodec_TextDecoder(encoding);
617 if (decoder == NULL)
618 return NULL;
619
620 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
621}
622
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000623/* Register the error handling callback function error under the name
624 name. This function will be called by the codec when it encounters
625 an unencodable characters/undecodable bytes and doesn't know the
626 callback name, when name is specified as the error parameter
627 in the call to the encode/decode function.
628 Return 0 on success, -1 on error */
629int PyCodec_RegisterError(const char *name, PyObject *error)
630{
Victor Stinnercaba55b2018-08-03 15:33:52 +0200631 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000632 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000634 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 PyErr_SetString(PyExc_TypeError, "handler must be callable");
636 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000637 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000638 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300639 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000640}
641
642/* Lookup the error handling callback function registered under the
643 name error. As a special case NULL can be passed, in which case
644 the error handling callback for strict encoding will be returned. */
645PyObject *PyCodec_LookupError(const char *name)
646{
647 PyObject *handler = NULL;
648
Victor Stinnercaba55b2018-08-03 15:33:52 +0200649 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000650 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000652
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000653 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 name = "strict";
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200655 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
656 if (handler) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 Py_INCREF(handler);
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200658 }
659 else if (!PyErr_Occurred()) {
660 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662 return handler;
663}
664
665static void wrong_exception_type(PyObject *exc)
666{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300667 PyErr_Format(PyExc_TypeError,
668 "don't know how to handle %.200s in error callback",
669 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670}
671
672PyObject *PyCodec_StrictErrors(PyObject *exc)
673{
Brett Cannonbf364092006-03-01 04:25:17 +0000674 if (PyExceptionInstance_Check(exc))
675 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000677 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000678 return NULL;
679}
680
681
682PyObject *PyCodec_IgnoreErrors(PyObject *exc)
683{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000684 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300685
686 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 if (PyUnicodeEncodeError_GetEnd(exc, &end))
688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000689 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300690 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 if (PyUnicodeDecodeError_GetEnd(exc, &end))
692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300694 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000695 if (PyUnicodeTranslateError_GetEnd(exc, &end))
696 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697 }
698 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699 wrong_exception_type(exc);
700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701 }
Victor Stinneree450092011-12-01 02:52:11 +0100702 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000703}
704
705
706PyObject *PyCodec_ReplaceErrors(PyObject *exc)
707{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200708 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000709
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300710 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 int kind;
713 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 if (PyUnicodeEncodeError_GetStart(exc, &start))
715 return NULL;
716 if (PyUnicodeEncodeError_GetEnd(exc, &end))
717 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718 len = end - start;
719 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000720 if (res == NULL)
721 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722 kind = PyUnicode_KIND(res);
723 data = PyUnicode_DATA(res);
724 for (i = 0; i < len; ++i)
725 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200726 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000728 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300729 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000730 if (PyUnicodeDecodeError_GetEnd(exc, &end))
731 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200732 return Py_BuildValue("(Cn)",
733 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
734 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000735 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300736 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738 int kind;
739 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (PyUnicodeTranslateError_GetStart(exc, &start))
741 return NULL;
742 if (PyUnicodeTranslateError_GetEnd(exc, &end))
743 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 len = end - start;
745 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 if (res == NULL)
747 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748 kind = PyUnicode_KIND(res);
749 data = PyUnicode_DATA(res);
750 for (i=0; i < len; i++)
751 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200752 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200753 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000754 }
755 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 wrong_exception_type(exc);
757 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000758 }
759}
760
761PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
762{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300763 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 PyObject *restuple;
765 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100766 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000767 Py_ssize_t start;
768 Py_ssize_t end;
769 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100770 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300771 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100772 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000773 if (PyUnicodeEncodeError_GetStart(exc, &start))
774 return NULL;
775 if (PyUnicodeEncodeError_GetEnd(exc, &end))
776 return NULL;
777 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
778 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300779 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
780 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100781 for (i = start, ressize = 0; i < end; ++i) {
782 /* object is guaranteed to be "ready" */
783 ch = PyUnicode_READ_CHAR(object, i);
784 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100794 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 ressize += 2+6+1;
796 else
797 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 }
799 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100800 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 if (res == NULL) {
802 Py_DECREF(object);
803 return NULL;
804 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100805 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100807 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 int digits;
809 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100810 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 *outp++ = '&';
812 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100813 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 digits = 1;
815 base = 1;
816 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100817 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 digits = 2;
819 base = 10;
820 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 digits = 3;
823 base = 100;
824 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 digits = 4;
827 base = 1000;
828 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100829 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 digits = 5;
831 base = 10000;
832 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100833 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000834 digits = 6;
835 base = 100000;
836 }
837 else {
838 digits = 7;
839 base = 1000000;
840 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100842 *outp++ = '0' + ch/base;
843 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844 base /= 10;
845 }
846 *outp++ = ';';
847 }
Victor Stinner8f825062012-04-27 13:55:39 +0200848 assert(_PyUnicode_CheckConsistency(res, 1));
849 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 Py_DECREF(object);
851 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000852 }
853 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000854 wrong_exception_type(exc);
855 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000856 }
857}
858
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000859PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
860{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200861 PyObject *object;
862 Py_ssize_t i;
863 Py_ssize_t start;
864 Py_ssize_t end;
865 PyObject *res;
866 unsigned char *outp;
867 int ressize;
868 Py_UCS4 c;
869
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300870 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300871 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200872 if (PyUnicodeDecodeError_GetStart(exc, &start))
873 return NULL;
874 if (PyUnicodeDecodeError_GetEnd(exc, &end))
875 return NULL;
876 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
877 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300878 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200879 res = PyUnicode_New(4 * (end - start), 127);
880 if (res == NULL) {
881 Py_DECREF(object);
882 return NULL;
883 }
884 outp = PyUnicode_1BYTE_DATA(res);
885 for (i = start; i < end; i++, outp += 4) {
886 unsigned char c = p[i];
887 outp[0] = '\\';
888 outp[1] = 'x';
889 outp[2] = Py_hexdigits[(c>>4)&0xf];
890 outp[3] = Py_hexdigits[c&0xf];
891 }
892
893 assert(_PyUnicode_CheckConsistency(res, 1));
894 Py_DECREF(object);
895 return Py_BuildValue("(Nn)", res, end);
896 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300897 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 if (PyUnicodeEncodeError_GetStart(exc, &start))
899 return NULL;
900 if (PyUnicodeEncodeError_GetEnd(exc, &end))
901 return NULL;
902 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
903 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200904 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300905 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200906 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000907 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200908 if (PyUnicodeTranslateError_GetEnd(exc, &end))
909 return NULL;
910 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
911 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000912 }
913 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 wrong_exception_type(exc);
915 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000916 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200917
918 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
919 end = start + PY_SSIZE_T_MAX / (1+1+8);
920 for (i = start, ressize = 0; i < end; ++i) {
921 /* object is guaranteed to be "ready" */
922 c = PyUnicode_READ_CHAR(object, i);
923 if (c >= 0x10000) {
924 ressize += 1+1+8;
925 }
926 else if (c >= 0x100) {
927 ressize += 1+1+4;
928 }
929 else
930 ressize += 1+1+2;
931 }
932 res = PyUnicode_New(ressize, 127);
933 if (res == NULL) {
934 Py_DECREF(object);
935 return NULL;
936 }
937 outp = PyUnicode_1BYTE_DATA(res);
938 for (i = start; i < end; ++i) {
939 c = PyUnicode_READ_CHAR(object, i);
940 *outp++ = '\\';
941 if (c >= 0x00010000) {
942 *outp++ = 'U';
943 *outp++ = Py_hexdigits[(c>>28)&0xf];
944 *outp++ = Py_hexdigits[(c>>24)&0xf];
945 *outp++ = Py_hexdigits[(c>>20)&0xf];
946 *outp++ = Py_hexdigits[(c>>16)&0xf];
947 *outp++ = Py_hexdigits[(c>>12)&0xf];
948 *outp++ = Py_hexdigits[(c>>8)&0xf];
949 }
950 else if (c >= 0x100) {
951 *outp++ = 'u';
952 *outp++ = Py_hexdigits[(c>>12)&0xf];
953 *outp++ = Py_hexdigits[(c>>8)&0xf];
954 }
955 else
956 *outp++ = 'x';
957 *outp++ = Py_hexdigits[(c>>4)&0xf];
958 *outp++ = Py_hexdigits[c&0xf];
959 }
960
961 assert(_PyUnicode_CheckConsistency(res, 1));
962 Py_DECREF(object);
963 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000964}
965
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200966static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200967
968PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
969{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300970 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200971 PyObject *restuple;
972 PyObject *object;
973 Py_ssize_t i;
974 Py_ssize_t start;
975 Py_ssize_t end;
976 PyObject *res;
977 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200978 Py_ssize_t ressize;
979 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200980 Py_UCS4 c;
981 char buffer[256]; /* NAME_MAXLEN */
982 if (PyUnicodeEncodeError_GetStart(exc, &start))
983 return NULL;
984 if (PyUnicodeEncodeError_GetEnd(exc, &end))
985 return NULL;
986 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
987 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200988 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200989 /* load the unicode data module */
990 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
991 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200992 if (!ucnhash_CAPI)
993 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200994 }
995 for (i = start, ressize = 0; i < end; ++i) {
996 /* object is guaranteed to be "ready" */
997 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200998 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200999 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001000 }
1001 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001002 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001003 }
1004 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001005 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001006 }
1007 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001008 replsize = 1+1+2;
1009 if (ressize > PY_SSIZE_T_MAX - replsize)
1010 break;
1011 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001012 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001013 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001014 res = PyUnicode_New(ressize, 127);
1015 if (res==NULL)
1016 return NULL;
1017 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1018 i < end; ++i) {
1019 c = PyUnicode_READ_CHAR(object, i);
1020 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001021 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001022 *outp++ = 'N';
1023 *outp++ = '{';
1024 strcpy((char *)outp, buffer);
1025 outp += strlen(buffer);
1026 *outp++ = '}';
1027 continue;
1028 }
1029 if (c >= 0x00010000) {
1030 *outp++ = 'U';
1031 *outp++ = Py_hexdigits[(c>>28)&0xf];
1032 *outp++ = Py_hexdigits[(c>>24)&0xf];
1033 *outp++ = Py_hexdigits[(c>>20)&0xf];
1034 *outp++ = Py_hexdigits[(c>>16)&0xf];
1035 *outp++ = Py_hexdigits[(c>>12)&0xf];
1036 *outp++ = Py_hexdigits[(c>>8)&0xf];
1037 }
1038 else if (c >= 0x100) {
1039 *outp++ = 'u';
1040 *outp++ = Py_hexdigits[(c>>12)&0xf];
1041 *outp++ = Py_hexdigits[(c>>8)&0xf];
1042 }
1043 else
1044 *outp++ = 'x';
1045 *outp++ = Py_hexdigits[(c>>4)&0xf];
1046 *outp++ = Py_hexdigits[c&0xf];
1047 }
1048
Benjamin Peterson3663b582014-11-26 14:39:54 -06001049 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001050 assert(_PyUnicode_CheckConsistency(res, 1));
1051 restuple = Py_BuildValue("(Nn)", res, end);
1052 Py_DECREF(object);
1053 return restuple;
1054 }
1055 else {
1056 wrong_exception_type(exc);
1057 return NULL;
1058 }
1059}
1060
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name as one of the ENC_* constants.

   Recognised spellings (the "utf" prefix and the "be"/"le" suffix are
   matched case-insensitively, and '-' or '_' is accepted as an optional
   separator): utf8, utf16, utf16be, utf16le, utf32, utf32be, utf32le, plus
   the exact string "CP_UTF8".  A name without a byte-order suffix maps to
   the native byte order as selected by WORDS_BIGENDIAN.

   `encoding` must be a NUL-terminated string.  *bytelength receives 3, 2
   or 4 for UTF-8, UTF-16 and UTF-32 respectively.  It is also written when
   a "utf16"/"utf32" prefix is followed by an unrecognised suffix, in which
   case ENC_UNKNOWN is returned; for any other unrecognised name it is left
   untouched.

   No byte after the terminating NUL is ever examined. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    int big_endian;
    int little_endian;
    int order;

    if (Py_TOLOWER(encoding[0]) != 'u' ||
        Py_TOLOWER(encoding[1]) != 't' ||
        Py_TOLOWER(encoding[2]) != 'f') {
        if (strcmp(encoding, "CP_UTF8") == 0) {
            *bytelength = 3;
            return ENC_UTF8;
        }
        return ENC_UNKNOWN;
    }

    encoding += 3;
    if (*encoding == '-' || *encoding == '_' )
        encoding++;

    if (encoding[0] == '8' && encoding[1] == '\0') {
        *bytelength = 3;
        return ENC_UTF8;
    }
    if (encoding[0] == '1' && encoding[1] == '6') {
        *bytelength = 2;
        big_endian = ENC_UTF16BE;
        little_endian = ENC_UTF16LE;
    }
    else if (encoding[0] == '3' && encoding[1] == '2') {
        *bytelength = 4;
        big_endian = ENC_UTF32BE;
        little_endian = ENC_UTF32LE;
    }
    else {
        return ENC_UNKNOWN;
    }

    encoding += 2;
    if (*encoding == '\0') {
        /* No explicit byte order: use the platform's. */
#ifdef WORDS_BIGENDIAN
        return big_endian;
#else
        return little_endian;
#endif
    }
    if (*encoding == '-' || *encoding == '_' )
        encoding++;

    /* Check encoding[0] before encoding[1]: for a name such as "utf-16-"
       encoding[0] is already the terminator and encoding[1] would lie
       outside the string. */
    order = Py_TOLOWER(encoding[0]);
    if ((order == 'b' || order == 'l') &&
        Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
        return order == 'b' ? big_endian : little_endian;
    }
    return ENC_UNKNOWN;
}
1126
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001127/* This handler is declared static until someone demonstrates
1128 a need to call it directly. */
1129static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001130PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001131{
1132 PyObject *restuple;
1133 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001134 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001135 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001136 int code;
1137 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001138 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001139 Py_ssize_t start;
1140 Py_ssize_t end;
1141 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001142
1143 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001144 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001145 if (PyUnicodeEncodeError_GetStart(exc, &start))
1146 return NULL;
1147 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1148 return NULL;
1149 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1150 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001151 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1152 Py_DECREF(object);
1153 return NULL;
1154 }
1155 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1156 Py_DECREF(object);
1157 Py_DECREF(encode);
1158 return NULL;
1159 }
1160 code = get_standard_encoding(encoding, &bytelength);
1161 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001162 if (code == ENC_UNKNOWN) {
1163 /* Not supported, fail with original exception */
1164 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165 Py_DECREF(object);
1166 return NULL;
1167 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001168
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001169 if (end - start > PY_SSIZE_T_MAX / bytelength)
1170 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001171 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 if (!res) {
1173 Py_DECREF(object);
1174 return NULL;
1175 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001176 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001177 for (i = start; i < end; i++) {
1178 /* object is guaranteed to be "ready" */
1179 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001180 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001181 /* Not a surrogate, fail with original exception */
1182 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1183 Py_DECREF(res);
1184 Py_DECREF(object);
1185 return NULL;
1186 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001187 switch (code) {
1188 case ENC_UTF8:
1189 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1190 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1191 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1192 break;
1193 case ENC_UTF16LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 break;
1197 case ENC_UTF16BE:
1198 *outp++ = (unsigned char)(ch >> 8);
1199 *outp++ = (unsigned char) ch;
1200 break;
1201 case ENC_UTF32LE:
1202 *outp++ = (unsigned char) ch;
1203 *outp++ = (unsigned char)(ch >> 8);
1204 *outp++ = (unsigned char)(ch >> 16);
1205 *outp++ = (unsigned char)(ch >> 24);
1206 break;
1207 case ENC_UTF32BE:
1208 *outp++ = (unsigned char)(ch >> 24);
1209 *outp++ = (unsigned char)(ch >> 16);
1210 *outp++ = (unsigned char)(ch >> 8);
1211 *outp++ = (unsigned char) ch;
1212 break;
1213 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 }
1215 restuple = Py_BuildValue("(On)", res, end);
1216 Py_DECREF(res);
1217 Py_DECREF(object);
1218 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001219 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001220 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001221 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001222 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001223 if (PyUnicodeDecodeError_GetStart(exc, &start))
1224 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001225 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1226 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001227 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1228 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001229 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001230 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1231 Py_DECREF(object);
1232 return NULL;
1233 }
1234 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1235 Py_DECREF(object);
1236 Py_DECREF(encode);
1237 return NULL;
1238 }
1239 code = get_standard_encoding(encoding, &bytelength);
1240 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001241 if (code == ENC_UNKNOWN) {
1242 /* Not supported, fail with original exception */
1243 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1244 Py_DECREF(object);
1245 return NULL;
1246 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001247
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001248 /* Try decoding a single surrogate character. If
1249 there are more, let the codec call us again. */
1250 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001251 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1252 switch (code) {
1253 case ENC_UTF8:
1254 if ((p[0] & 0xf0) == 0xe0 &&
1255 (p[1] & 0xc0) == 0x80 &&
1256 (p[2] & 0xc0) == 0x80) {
1257 /* it's a three-byte code */
1258 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1259 }
1260 break;
1261 case ENC_UTF16LE:
1262 ch = p[1] << 8 | p[0];
1263 break;
1264 case ENC_UTF16BE:
1265 ch = p[0] << 8 | p[1];
1266 break;
1267 case ENC_UTF32LE:
1268 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1269 break;
1270 case ENC_UTF32BE:
1271 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1272 break;
1273 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001274 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001275
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001276 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001277 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1278 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1280 return NULL;
1281 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001282 res = PyUnicode_FromOrdinal(ch);
1283 if (res == NULL)
1284 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001285 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001286 }
1287 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001288 wrong_exception_type(exc);
1289 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001290 }
1291}
1292
Martin v. Löwis011e8422009-05-05 04:43:17 +00001293static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001294PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001295{
1296 PyObject *restuple;
1297 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001298 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001299 Py_ssize_t start;
1300 Py_ssize_t end;
1301 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001302
1303 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 char *outp;
1305 if (PyUnicodeEncodeError_GetStart(exc, &start))
1306 return NULL;
1307 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1308 return NULL;
1309 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1310 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 res = PyBytes_FromStringAndSize(NULL, end-start);
1312 if (!res) {
1313 Py_DECREF(object);
1314 return NULL;
1315 }
1316 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001317 for (i = start; i < end; i++) {
1318 /* object is guaranteed to be "ready" */
1319 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 if (ch < 0xdc80 || ch > 0xdcff) {
1321 /* Not a UTF-8b surrogate, fail with original exception */
1322 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1323 Py_DECREF(res);
1324 Py_DECREF(object);
1325 return NULL;
1326 }
1327 *outp++ = ch - 0xdc00;
1328 }
1329 restuple = Py_BuildValue("(On)", res, end);
1330 Py_DECREF(res);
1331 Py_DECREF(object);
1332 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001333 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001334 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001335 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001336 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001337 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 int consumed = 0;
1339 if (PyUnicodeDecodeError_GetStart(exc, &start))
1340 return NULL;
1341 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1342 return NULL;
1343 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1344 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001345 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 while (consumed < 4 && consumed < end-start) {
1347 /* Refuse to escape ASCII bytes. */
1348 if (p[start+consumed] < 128)
1349 break;
1350 ch[consumed] = 0xdc00 + p[start+consumed];
1351 consumed++;
1352 }
1353 Py_DECREF(object);
1354 if (!consumed) {
1355 /* codec complained about ASCII byte. */
1356 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1357 return NULL;
1358 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001359 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1360 if (str == NULL)
1361 return NULL;
1362 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001363 }
1364 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001365 wrong_exception_type(exc);
1366 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001367 }
1368}
1369
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001370
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001371static PyObject *strict_errors(PyObject *self, PyObject *exc)
1372{
1373 return PyCodec_StrictErrors(exc);
1374}
1375
1376
1377static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1378{
1379 return PyCodec_IgnoreErrors(exc);
1380}
1381
1382
1383static PyObject *replace_errors(PyObject *self, PyObject *exc)
1384{
1385 return PyCodec_ReplaceErrors(exc);
1386}
1387
1388
1389static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1390{
1391 return PyCodec_XMLCharRefReplaceErrors(exc);
1392}
1393
1394
1395static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1396{
1397 return PyCodec_BackslashReplaceErrors(exc);
1398}
1399
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001400static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1401{
1402 return PyCodec_NameReplaceErrors(exc);
1403}
1404
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001405static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001406{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001407 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001408}
1409
Martin v. Löwis43c57782009-05-10 08:15:24 +00001410static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001411{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001412 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001413}
1414
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001415static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001416{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001417 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001418 char *name;
1419 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001420 } methods[] =
1421 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001422 {
1423 "strict",
1424 {
1425 "strict_errors",
1426 strict_errors,
1427 METH_O,
1428 PyDoc_STR("Implements the 'strict' error handling, which "
1429 "raises a UnicodeError on coding errors.")
1430 }
1431 },
1432 {
1433 "ignore",
1434 {
1435 "ignore_errors",
1436 ignore_errors,
1437 METH_O,
1438 PyDoc_STR("Implements the 'ignore' error handling, which "
1439 "ignores malformed data and continues.")
1440 }
1441 },
1442 {
1443 "replace",
1444 {
1445 "replace_errors",
1446 replace_errors,
1447 METH_O,
1448 PyDoc_STR("Implements the 'replace' error handling, which "
1449 "replaces malformed data with a replacement marker.")
1450 }
1451 },
1452 {
1453 "xmlcharrefreplace",
1454 {
1455 "xmlcharrefreplace_errors",
1456 xmlcharrefreplace_errors,
1457 METH_O,
1458 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1459 "which replaces an unencodable character with the "
1460 "appropriate XML character reference.")
1461 }
1462 },
1463 {
1464 "backslashreplace",
1465 {
1466 "backslashreplace_errors",
1467 backslashreplace_errors,
1468 METH_O,
1469 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001470 "which replaces malformed data with a backslashed "
1471 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001472 }
1473 },
1474 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001475 "namereplace",
1476 {
1477 "namereplace_errors",
1478 namereplace_errors,
1479 METH_O,
1480 PyDoc_STR("Implements the 'namereplace' error handling, "
1481 "which replaces an unencodable character with a "
1482 "\\N{...} escape sequence.")
1483 }
1484 },
1485 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001486 "surrogatepass",
1487 {
1488 "surrogatepass",
1489 surrogatepass_errors,
1490 METH_O
1491 }
1492 },
1493 {
1494 "surrogateescape",
1495 {
1496 "surrogateescape",
1497 surrogateescape_errors,
1498 METH_O
1499 }
1500 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001502
Victor Stinnercaba55b2018-08-03 15:33:52 +02001503 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001504 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001505 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001506
1507 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001508 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001509
1510 interp->codec_search_path = PyList_New(0);
1511 interp->codec_search_cache = PyDict_New();
1512 interp->codec_error_registry = PyDict_New();
1513
1514 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001515 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001516 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001517 int res;
1518 if (!func)
1519 Py_FatalError("can't initialize codec error registry");
1520 res = PyCodec_RegisterError(methods[i].name, func);
1521 Py_DECREF(func);
1522 if (res)
1523 Py_FatalError("can't initialize codec error registry");
1524 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001526
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001527 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001528 interp->codec_search_cache == NULL ||
1529 interp->codec_error_registry == NULL)
1530 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001531
Christian Heimes819b8bf2008-01-03 23:05:47 +00001532 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001533 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001534 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001535 }
1536 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001537 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001538 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001539}