blob: a8233a73c4ed3f51886b24fa178f71fe41c7c8bd [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020012#include "pycore_interp.h" // PyInterpreterState.codec_search_path
13#include "pycore_pystate.h" // _PyInterpreterState_GET()
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020014#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000015#include <ctype.h>
16
Victor Stinnerf5cff562011-10-14 02:13:11 +020017const char *Py_hexdigits = "0123456789abcdef";
18
Guido van Rossumfeee4b92000-03-10 22:57:27 +000019/* --- Codec Registry ----------------------------------------------------- */
20
21/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000022 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000023
24 This is done in a lazy way so that the Unicode implementation does
25 not downgrade startup time of scripts not needing it.
26
Guido van Rossumb95de4f2000-03-31 17:25:23 +000027 ImportErrors are silently ignored by this function. Only one try is
28 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029
30*/
31
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034int PyCodec_Register(PyObject *search_function)
35{
Victor Stinner81a7be32020-04-14 15:14:01 +020036 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000037 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000038 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000040 PyErr_BadArgument();
41 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000042 }
43 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000044 PyErr_SetString(PyExc_TypeError, "argument must be callable");
45 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000047 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000048
49 onError:
50 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051}
52
Hai Shid332e7b2020-09-29 05:41:11 +080053int
54PyCodec_Unregister(PyObject *search_function)
55{
56 PyInterpreterState *interp = PyInterpreterState_Get();
57 PyObject *codec_search_path = interp->codec_search_path;
58 /* Do nothing if codec_search_path is not created yet or was cleared. */
59 if (codec_search_path == NULL) {
60 return 0;
61 }
62
63 assert(PyList_CheckExact(codec_search_path));
64 Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
65 for (Py_ssize_t i = 0; i < n; i++) {
66 PyObject *item = PyList_GET_ITEM(codec_search_path, i);
67 if (item == search_function) {
68 if (interp->codec_search_cache != NULL) {
69 assert(PyDict_CheckExact(interp->codec_search_cache));
70 PyDict_Clear(interp->codec_search_cache);
71 }
72 return PyList_SetSlice(codec_search_path, i, i+1, NULL);
73 }
74 }
75 return 0;
76}
77
Jordon Xu20f59fe2019-08-21 21:26:20 +080078extern int _Py_normalize_encoding(const char *, char *, size_t);
79
/* Convert a string to a normalized Python string (decoded from UTF-8): all
   characters are converted to lower case, spaces and hyphens are replaced
   with underscores. */
Guido van Rossum9e896b32000-04-05 20:11:21 +000082
Guido van Rossumfeee4b92000-03-10 22:57:27 +000083static
Guido van Rossum9e896b32000-04-05 20:11:21 +000084PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000085{
Guido van Rossum582acec2000-06-28 22:07:35 +000086 size_t len = strlen(string);
Jordon Xu20f59fe2019-08-21 21:26:20 +080087 char *encoding;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000088 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 PyErr_SetString(PyExc_OverflowError, "string is too large");
92 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093 }
Guido van Rossum21431e82007-10-19 21:48:41 +000094
Jordon Xu20f59fe2019-08-21 21:26:20 +080095 encoding = PyMem_Malloc(len + 1);
96 if (encoding == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020097 return PyErr_NoMemory();
Jordon Xu20f59fe2019-08-21 21:26:20 +080098
99 if (!_Py_normalize_encoding(string, encoding, len + 1))
100 {
101 PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
102 PyMem_Free(encoding);
103 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000104 }
Jordon Xu20f59fe2019-08-21 21:26:20 +0800105
106 v = PyUnicode_FromString(encoding);
107 PyMem_Free(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108 return v;
109}
110
111/* Lookup the given encoding and return a tuple providing the codec
112 facilities.
113
114 The encoding string is looked up converted to all lower-case
115 characters. This makes encodings looked up through this mechanism
116 effectively case-insensitive.
117
Guido van Rossum98297ee2007-11-06 21:34:58 +0000118 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000119
120 As side effect, this tries to load the encodings package, if not
121 yet done. This is part of the lazy load strategy for the encodings
122 package.
123
124*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125
126PyObject *_PyCodec_Lookup(const char *encoding)
127{
Fred Drake766de832000-05-09 19:55:59 +0000128 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000129 PyErr_BadArgument();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200130 return NULL;
Fred Drake766de832000-05-09 19:55:59 +0000131 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000132
Victor Stinner81a7be32020-04-14 15:14:01 +0200133 PyInterpreterState *interp = _PyInterpreterState_GET();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200134 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
135 return NULL;
136 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000137
Guido van Rossum9e896b32000-04-05 20:11:21 +0000138 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000139 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000140 replaced with underscores. */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200141 PyObject *v = normalizestring(encoding);
142 if (v == NULL) {
143 return NULL;
144 }
Guido van Rossum21431e82007-10-19 21:48:41 +0000145 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 /* First, try to lookup the name in the registry dictionary */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200148 PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000149 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 Py_INCREF(result);
151 Py_DECREF(v);
152 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000153 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200154 else if (PyErr_Occurred()) {
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200155 goto onError;
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200156 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000157
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000158 /* Next, scan the search functions in order of registration */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200159 const Py_ssize_t len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000160 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000162 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 PyErr_SetString(PyExc_LookupError,
164 "no codec search functions registered: "
165 "can't find encoding");
166 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000167 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000168
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200169 Py_ssize_t i;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000172
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 func = PyList_GetItem(interp->codec_search_path, i);
174 if (func == NULL)
175 goto onError;
Petr Viktorinffd97532020-02-11 17:46:57 +0100176 result = PyObject_CallOneArg(func, v);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 if (result == NULL)
178 goto onError;
179 if (result == Py_None) {
180 Py_DECREF(result);
181 continue;
182 }
183 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
184 PyErr_SetString(PyExc_TypeError,
185 "codec search functions must return 4-tuples");
186 Py_DECREF(result);
187 goto onError;
188 }
189 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000190 }
191 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 /* XXX Perhaps we should cache misses too ? */
193 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000194 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000196 }
197
198 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000199 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(result);
201 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000202 }
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200203 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000204 return result;
205
206 onError:
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200207 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208 return NULL;
209}
210
Nick Coghlan8fad1672014-09-15 23:50:44 +1200211int _PyCodec_Forget(const char *encoding)
212{
Nick Coghlan8fad1672014-09-15 23:50:44 +1200213 PyObject *v;
214 int result;
215
Victor Stinner81a7be32020-04-14 15:14:01 +0200216 PyInterpreterState *interp = _PyInterpreterState_GET();
Nick Coghlan8fad1672014-09-15 23:50:44 +1200217 if (interp->codec_search_path == NULL) {
218 return -1;
219 }
220
221 /* Convert the encoding to a normalized Python string: all
222 characters are converted to lower case, spaces and hyphens are
223 replaced with underscores. */
224 v = normalizestring(encoding);
225 if (v == NULL) {
226 return -1;
227 }
228
229 /* Drop the named codec from the internal cache */
230 result = PyDict_DelItem(interp->codec_search_cache, v);
231 Py_DECREF(v);
232
233 return result;
234}
235
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000236/* Codec registry encoding check API. */
237
238int PyCodec_KnownEncoding(const char *encoding)
239{
240 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000242 codecs = _PyCodec_Lookup(encoding);
243 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000244 PyErr_Clear();
245 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000246 }
247 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000248 Py_DECREF(codecs);
249 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000250 }
251}
252
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000253static
254PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000255 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256{
257 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000258
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000259 args = PyTuple_New(1 + (errors != NULL));
260 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000262 Py_INCREF(object);
263 PyTuple_SET_ITEM(args,0,object);
264 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000266
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000267 v = PyUnicode_FromString(errors);
268 if (v == NULL) {
269 Py_DECREF(args);
270 return NULL;
271 }
272 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000273 }
274 return args;
275}
276
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000278
279static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000281{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 PyObject *codecs;
283 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000284
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 codecs = _PyCodec_Lookup(encoding);
286 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 v = PyTuple_GET_ITEM(codecs, index);
289 Py_DECREF(codecs);
290 Py_INCREF(v);
291 return v;
292}
293
Nick Coghlana9b15242014-02-04 22:11:18 +1000294/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000295static
Nick Coghlana9b15242014-02-04 22:11:18 +1000296PyObject *codec_makeincrementalcodec(PyObject *codec_info,
297 const char *errors,
298 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299{
Nick Coghlana9b15242014-02-04 22:11:18 +1000300 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301
Nick Coghlana9b15242014-02-04 22:11:18 +1000302 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000303 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000304 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000305 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000306 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000307 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100308 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000309 Py_DECREF(inccodec);
310 return ret;
311}
312
Nick Coghlana9b15242014-02-04 22:11:18 +1000313static
314PyObject *codec_getincrementalcodec(const char *encoding,
315 const char *errors,
316 const char *attrname)
317{
318 PyObject *codec_info, *ret;
319
320 codec_info = _PyCodec_Lookup(encoding);
321 if (codec_info == NULL)
322 return NULL;
323 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
324 Py_DECREF(codec_info);
325 return ret;
326}
327
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000328/* Helper function to create a stream codec. */
329
330static
331PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000332 PyObject *stream,
333 const char *errors,
334 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000335{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000336 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000337
338 codecs = _PyCodec_Lookup(encoding);
339 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000340 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000341
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000342 codeccls = PyTuple_GET_ITEM(codecs, index);
343 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000345 else
Petr Viktorinffd97532020-02-11 17:46:57 +0100346 streamcodec = PyObject_CallOneArg(codeccls, stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000347 Py_DECREF(codecs);
348 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349}
350
Nick Coghlana9b15242014-02-04 22:11:18 +1000351/* Helpers to work with the result of _PyCodec_Lookup
352
353 */
354PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
355 const char *errors)
356{
357 return codec_makeincrementalcodec(codec_info, errors,
358 "incrementaldecoder");
359}
360
361PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
362 const char *errors)
363{
364 return codec_makeincrementalcodec(codec_info, errors,
365 "incrementalencoder");
366}
367
368
Guido van Rossum98297ee2007-11-06 21:34:58 +0000369/* Convenience APIs to query the Codec registry.
370
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000371 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000372
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373 */
374
375PyObject *PyCodec_Encoder(const char *encoding)
376{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378}
379
380PyObject *PyCodec_Decoder(const char *encoding)
381{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000382 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383}
384
Thomas Woutersa9773292006-04-21 09:43:23 +0000385PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000386 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000387{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000388 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000389}
390
391PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000392 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000393{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000394 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000395}
396
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000397PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000398 PyObject *stream,
399 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000400{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000401 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000402}
403
404PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000405 PyObject *stream,
406 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000407{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000408 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000409}
410
Nick Coghlan8b097b42013-11-13 23:49:21 +1000411/* Helper that tries to ensure the reported exception chain indicates the
412 * codec that was invoked to trigger the failure without changing the type
413 * of the exception raised.
414 */
/* Ask _PyErr_TrySetFromCause() to annotate the active exception with the
   codec operation ("encoding"/"decoding") and the codec name that
   failed. */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause() swaps in an updated clone of the active
     * exception when it is able to; if it cannot, the original exception
     * is left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
426
Martin Panter6245cb32016-04-15 02:14:19 +0000427/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000428 and return the resulting encoded object (usually a Python string).
429
430 errors is passed to the encoder factory as argument if non-NULL. */
431
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000432static PyObject *
433_PyCodec_EncodeInternal(PyObject *object,
434 PyObject *encoder,
435 const char *encoding,
436 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000437{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000438 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000439 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000440
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000441 args = args_tuple(object, errors);
442 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000443 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000444
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200445 result = PyObject_Call(encoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000446 if (result == NULL) {
447 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000448 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000449 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000450
Guido van Rossum98297ee2007-11-06 21:34:58 +0000451 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000452 PyTuple_GET_SIZE(result) != 2) {
453 PyErr_SetString(PyExc_TypeError,
454 "encoder must return a tuple (object, integer)");
455 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000456 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000457 v = PyTuple_GET_ITEM(result,0);
458 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000459 /* We don't check or use the second (integer) entry. */
460
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000461 Py_DECREF(args);
462 Py_DECREF(encoder);
463 Py_DECREF(result);
464 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000466 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000467 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000468 Py_XDECREF(args);
469 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000470 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000471}
472
473/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000474 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000475
476 errors is passed to the decoder factory as argument if non-NULL. */
477
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000478static PyObject *
479_PyCodec_DecodeInternal(PyObject *object,
480 PyObject *decoder,
481 const char *encoding,
482 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000483{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000484 PyObject *args = NULL, *result = NULL;
485 PyObject *v;
486
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000487 args = args_tuple(object, errors);
488 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000489 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000490
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200491 result = PyObject_Call(decoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000492 if (result == NULL) {
493 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000494 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000495 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000496 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000497 PyTuple_GET_SIZE(result) != 2) {
498 PyErr_SetString(PyExc_TypeError,
499 "decoder must return a tuple (object,integer)");
500 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000501 }
502 v = PyTuple_GET_ITEM(result,0);
503 Py_INCREF(v);
504 /* We don't check or use the second (integer) entry. */
505
506 Py_DECREF(args);
507 Py_DECREF(decoder);
508 Py_DECREF(result);
509 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000510
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000511 onError:
512 Py_XDECREF(args);
513 Py_XDECREF(decoder);
514 Py_XDECREF(result);
515 return NULL;
516}
517
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000518/* Generic encoding/decoding API */
519PyObject *PyCodec_Encode(PyObject *object,
520 const char *encoding,
521 const char *errors)
522{
523 PyObject *encoder;
524
525 encoder = PyCodec_Encoder(encoding);
526 if (encoder == NULL)
527 return NULL;
528
529 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
530}
531
532PyObject *PyCodec_Decode(PyObject *object,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *decoder;
537
538 decoder = PyCodec_Decoder(encoding);
539 if (decoder == NULL)
540 return NULL;
541
542 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
543}
544
545/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000546PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
547 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000548{
549 _Py_IDENTIFIER(_is_text_encoding);
550 PyObject *codec;
551 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000552 int is_text_codec;
553
554 codec = _PyCodec_Lookup(encoding);
555 if (codec == NULL)
556 return NULL;
557
558 /* Backwards compatibility: assume any raw tuple describes a text
559 * encoding, and the same for anything lacking the private
560 * attribute.
561 */
562 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200563 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
564 Py_DECREF(codec);
565 return NULL;
566 }
567 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000568 is_text_codec = PyObject_IsTrue(attr);
569 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300570 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000571 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300572 if (!is_text_codec)
573 PyErr_Format(PyExc_LookupError,
574 "'%.400s' is not a text encoding; "
575 "use %s to handle arbitrary codecs",
576 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000577 return NULL;
578 }
579 }
580 }
581
Nick Coghlana9b15242014-02-04 22:11:18 +1000582 /* This appears to be a valid text encoding */
583 return codec;
584}
585
586
587static
588PyObject *codec_getitem_checked(const char *encoding,
589 const char *alternate_command,
590 int index)
591{
592 PyObject *codec;
593 PyObject *v;
594
595 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
596 if (codec == NULL)
597 return NULL;
598
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000599 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000600 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000601 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000602 return v;
603}
604
605static PyObject * _PyCodec_TextEncoder(const char *encoding)
606{
Nick Coghlana9b15242014-02-04 22:11:18 +1000607 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000608}
609
610static PyObject * _PyCodec_TextDecoder(const char *encoding)
611{
Nick Coghlana9b15242014-02-04 22:11:18 +1000612 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000613}
614
615PyObject *_PyCodec_EncodeText(PyObject *object,
616 const char *encoding,
617 const char *errors)
618{
619 PyObject *encoder;
620
621 encoder = _PyCodec_TextEncoder(encoding);
622 if (encoder == NULL)
623 return NULL;
624
625 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
626}
627
628PyObject *_PyCodec_DecodeText(PyObject *object,
629 const char *encoding,
630 const char *errors)
631{
632 PyObject *decoder;
633
634 decoder = _PyCodec_TextDecoder(encoding);
635 if (decoder == NULL)
636 return NULL;
637
638 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
639}
640
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
   callback name, when name is specified as the error parameter
   in the call to the encode/decode function.
   Return 0 on success, -1 on error */
647int PyCodec_RegisterError(const char *name, PyObject *error)
648{
Victor Stinner81a7be32020-04-14 15:14:01 +0200649 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000650 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 PyErr_SetString(PyExc_TypeError, "handler must be callable");
654 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000655 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000656 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300657 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000658}
659
660/* Lookup the error handling callback function registered under the
661 name error. As a special case NULL can be passed, in which case
662 the error handling callback for strict encoding will be returned. */
663PyObject *PyCodec_LookupError(const char *name)
664{
665 PyObject *handler = NULL;
666
Victor Stinner81a7be32020-04-14 15:14:01 +0200667 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000668 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000670
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000671 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000672 name = "strict";
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200673 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
674 if (handler) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000675 Py_INCREF(handler);
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200676 }
677 else if (!PyErr_Occurred()) {
678 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
679 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000680 return handler;
681}
682
683static void wrong_exception_type(PyObject *exc)
684{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300685 PyErr_Format(PyExc_TypeError,
686 "don't know how to handle %.200s in error callback",
Victor Stinnera102ed72020-02-07 02:24:48 +0100687 Py_TYPE(exc)->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688}
689
690PyObject *PyCodec_StrictErrors(PyObject *exc)
691{
Brett Cannonbf364092006-03-01 04:25:17 +0000692 if (PyExceptionInstance_Check(exc))
693 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000694 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000695 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000696 return NULL;
697}
698
699
700PyObject *PyCodec_IgnoreErrors(PyObject *exc)
701{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000702 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300703
704 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (PyUnicodeEncodeError_GetEnd(exc, &end))
706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000707 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300708 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 if (PyUnicodeDecodeError_GetEnd(exc, &end))
710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000711 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300712 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 if (PyUnicodeTranslateError_GetEnd(exc, &end))
714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000715 }
716 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000717 wrong_exception_type(exc);
718 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000719 }
Victor Stinneree450092011-12-01 02:52:11 +0100720 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000721}
722
723
724PyObject *PyCodec_ReplaceErrors(PyObject *exc)
725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000727
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300728 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300730 Py_UCS1 *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 if (PyUnicodeEncodeError_GetStart(exc, &start))
732 return NULL;
733 if (PyUnicodeEncodeError_GetEnd(exc, &end))
734 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735 len = end - start;
736 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 if (res == NULL)
738 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300739 assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
740 outp = PyUnicode_1BYTE_DATA(res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200741 for (i = 0; i < len; ++i)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300742 outp[i] = '?';
Victor Stinner8f825062012-04-27 13:55:39 +0200743 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300746 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 if (PyUnicodeDecodeError_GetEnd(exc, &end))
748 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200749 return Py_BuildValue("(Cn)",
750 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
751 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000752 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300753 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300755 Py_UCS2 *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 if (PyUnicodeTranslateError_GetStart(exc, &start))
757 return NULL;
758 if (PyUnicodeTranslateError_GetEnd(exc, &end))
759 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200760 len = end - start;
761 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 if (res == NULL)
763 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300764 assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
765 outp = PyUnicode_2BYTE_DATA(res);
766 for (i = 0; i < len; i++)
767 outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
Victor Stinner8f825062012-04-27 13:55:39 +0200768 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200769 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000770 }
771 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 wrong_exception_type(exc);
773 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000774 }
775}
776
777PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
778{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300779 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 PyObject *restuple;
781 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100782 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 Py_ssize_t start;
784 Py_ssize_t end;
785 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300786 Py_UCS1 *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300787 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 if (PyUnicodeEncodeError_GetStart(exc, &start))
790 return NULL;
791 if (PyUnicodeEncodeError_GetEnd(exc, &end))
792 return NULL;
793 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
794 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300795 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
796 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100797 for (i = start, ressize = 0; i < end; ++i) {
798 /* object is guaranteed to be "ready" */
799 ch = PyUnicode_READ_CHAR(object, i);
800 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100802 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100804 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100806 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100808 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100810 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 ressize += 2+6+1;
812 else
813 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 }
815 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100816 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 if (res == NULL) {
818 Py_DECREF(object);
819 return NULL;
820 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100823 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 int digits;
825 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100826 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 *outp++ = '&';
828 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100829 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000830 digits = 1;
831 base = 1;
832 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100833 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000834 digits = 2;
835 base = 10;
836 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100837 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 digits = 3;
839 base = 100;
840 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100841 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 digits = 4;
843 base = 1000;
844 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100845 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 digits = 5;
847 base = 10000;
848 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100849 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000850 digits = 6;
851 base = 100000;
852 }
853 else {
854 digits = 7;
855 base = 1000000;
856 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000857 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100858 *outp++ = '0' + ch/base;
859 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000860 base /= 10;
861 }
862 *outp++ = ';';
863 }
Victor Stinner8f825062012-04-27 13:55:39 +0200864 assert(_PyUnicode_CheckConsistency(res, 1));
865 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000866 Py_DECREF(object);
867 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000868 }
869 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 wrong_exception_type(exc);
871 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 }
873}
874
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000875PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
876{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200877 PyObject *object;
878 Py_ssize_t i;
879 Py_ssize_t start;
880 Py_ssize_t end;
881 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300882 Py_UCS1 *outp;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200883 int ressize;
884 Py_UCS4 c;
885
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300886 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300887 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200888 if (PyUnicodeDecodeError_GetStart(exc, &start))
889 return NULL;
890 if (PyUnicodeDecodeError_GetEnd(exc, &end))
891 return NULL;
892 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
893 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300894 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200895 res = PyUnicode_New(4 * (end - start), 127);
896 if (res == NULL) {
897 Py_DECREF(object);
898 return NULL;
899 }
900 outp = PyUnicode_1BYTE_DATA(res);
901 for (i = start; i < end; i++, outp += 4) {
902 unsigned char c = p[i];
903 outp[0] = '\\';
904 outp[1] = 'x';
905 outp[2] = Py_hexdigits[(c>>4)&0xf];
906 outp[3] = Py_hexdigits[c&0xf];
907 }
908
909 assert(_PyUnicode_CheckConsistency(res, 1));
910 Py_DECREF(object);
911 return Py_BuildValue("(Nn)", res, end);
912 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300913 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000914 if (PyUnicodeEncodeError_GetStart(exc, &start))
915 return NULL;
916 if (PyUnicodeEncodeError_GetEnd(exc, &end))
917 return NULL;
918 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
919 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200920 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300921 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200922 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200924 if (PyUnicodeTranslateError_GetEnd(exc, &end))
925 return NULL;
926 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
927 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000928 }
929 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000930 wrong_exception_type(exc);
931 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000932 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200933
934 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
935 end = start + PY_SSIZE_T_MAX / (1+1+8);
936 for (i = start, ressize = 0; i < end; ++i) {
937 /* object is guaranteed to be "ready" */
938 c = PyUnicode_READ_CHAR(object, i);
939 if (c >= 0x10000) {
940 ressize += 1+1+8;
941 }
942 else if (c >= 0x100) {
943 ressize += 1+1+4;
944 }
945 else
946 ressize += 1+1+2;
947 }
948 res = PyUnicode_New(ressize, 127);
949 if (res == NULL) {
950 Py_DECREF(object);
951 return NULL;
952 }
953 outp = PyUnicode_1BYTE_DATA(res);
954 for (i = start; i < end; ++i) {
955 c = PyUnicode_READ_CHAR(object, i);
956 *outp++ = '\\';
957 if (c >= 0x00010000) {
958 *outp++ = 'U';
959 *outp++ = Py_hexdigits[(c>>28)&0xf];
960 *outp++ = Py_hexdigits[(c>>24)&0xf];
961 *outp++ = Py_hexdigits[(c>>20)&0xf];
962 *outp++ = Py_hexdigits[(c>>16)&0xf];
963 *outp++ = Py_hexdigits[(c>>12)&0xf];
964 *outp++ = Py_hexdigits[(c>>8)&0xf];
965 }
966 else if (c >= 0x100) {
967 *outp++ = 'u';
968 *outp++ = Py_hexdigits[(c>>12)&0xf];
969 *outp++ = Py_hexdigits[(c>>8)&0xf];
970 }
971 else
972 *outp++ = 'x';
973 *outp++ = Py_hexdigits[(c>>4)&0xf];
974 *outp++ = Py_hexdigits[c&0xf];
975 }
976
977 assert(_PyUnicode_CheckConsistency(res, 1));
978 Py_DECREF(object);
979 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000980}
981
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200982static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200983
984PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
985{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300986 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200987 PyObject *restuple;
988 PyObject *object;
989 Py_ssize_t i;
990 Py_ssize_t start;
991 Py_ssize_t end;
992 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300993 Py_UCS1 *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200994 Py_ssize_t ressize;
995 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200996 Py_UCS4 c;
997 char buffer[256]; /* NAME_MAXLEN */
998 if (PyUnicodeEncodeError_GetStart(exc, &start))
999 return NULL;
1000 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1001 return NULL;
1002 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1003 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +02001004 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001005 /* load the unicode data module */
1006 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
1007 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +02001008 if (!ucnhash_CAPI)
1009 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001010 }
1011 for (i = start, ressize = 0; i < end; ++i) {
1012 /* object is guaranteed to be "ready" */
1013 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +02001014 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +02001015 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001016 }
1017 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001018 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001019 }
1020 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001021 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001022 }
1023 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001024 replsize = 1+1+2;
1025 if (ressize > PY_SSIZE_T_MAX - replsize)
1026 break;
1027 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001028 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001029 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001030 res = PyUnicode_New(ressize, 127);
1031 if (res==NULL)
1032 return NULL;
1033 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1034 i < end; ++i) {
1035 c = PyUnicode_READ_CHAR(object, i);
1036 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001037 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001038 *outp++ = 'N';
1039 *outp++ = '{';
1040 strcpy((char *)outp, buffer);
1041 outp += strlen(buffer);
1042 *outp++ = '}';
1043 continue;
1044 }
1045 if (c >= 0x00010000) {
1046 *outp++ = 'U';
1047 *outp++ = Py_hexdigits[(c>>28)&0xf];
1048 *outp++ = Py_hexdigits[(c>>24)&0xf];
1049 *outp++ = Py_hexdigits[(c>>20)&0xf];
1050 *outp++ = Py_hexdigits[(c>>16)&0xf];
1051 *outp++ = Py_hexdigits[(c>>12)&0xf];
1052 *outp++ = Py_hexdigits[(c>>8)&0xf];
1053 }
1054 else if (c >= 0x100) {
1055 *outp++ = 'u';
1056 *outp++ = Py_hexdigits[(c>>12)&0xf];
1057 *outp++ = Py_hexdigits[(c>>8)&0xf];
1058 }
1059 else
1060 *outp++ = 'x';
1061 *outp++ = Py_hexdigits[(c>>4)&0xf];
1062 *outp++ = Py_hexdigits[c&0xf];
1063 }
1064
Benjamin Peterson3663b582014-11-26 14:39:54 -06001065 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001066 assert(_PyUnicode_CheckConsistency(res, 1));
1067 restuple = Py_BuildValue("(Nn)", res, end);
1068 Py_DECREF(object);
1069 return restuple;
1070 }
1071 else {
1072 wrong_exception_type(exc);
1073 return NULL;
1074 }
1075}
1076
/* Codes returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Resolve the byte-order part of a "utf16..." / "utf32..." name.

   `suffix` points just past the "16" or "32".  An empty suffix selects
   the variant matching WORDS_BIGENDIAN; otherwise an optional '-' or '_'
   followed by "be" or "le" (any case) selects `big` or `little`.
   Anything else yields ENC_UNKNOWN. */
static int
standard_encoding_byteorder(const char *suffix, int big, int little)
{
    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return big;
#else
        return little;
#endif
    }
    if (*suffix == '-' || *suffix == '_') {
        suffix++;
    }
    /* Test suffix[0] first: for a name that ends right after the
       separator (e.g. "utf-16-") suffix[0] is the terminating NUL and
       suffix[1] must not be read. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        if (Py_TOLOWER(suffix[0]) == 'b')
            return big;
        if (Py_TOLOWER(suffix[0]) == 'l')
            return little;
    }
    return ENC_UNKNOWN;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognized spellings: "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; the 16/32 forms may carry a byte-order suffix (see
   standard_encoding_byteorder()).  The exact string "CP_UTF8" is also
   accepted as UTF-8.

   On success *bytelength is set to 3 (UTF-8), 2 (UTF-16) or 4 (UTF-32).
   When ENC_UNKNOWN is returned the value of *bytelength is unspecified. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return standard_encoding_byteorder(encoding + 2,
                                               ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return standard_encoding_byteorder(encoding + 2,
                                               ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1142
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001143/* This handler is declared static until someone demonstrates
1144 a need to call it directly. */
1145static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001146PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001147{
1148 PyObject *restuple;
1149 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001150 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001151 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001152 int code;
1153 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001154 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001155 Py_ssize_t start;
1156 Py_ssize_t end;
1157 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001158
1159 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001160 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001161 if (PyUnicodeEncodeError_GetStart(exc, &start))
1162 return NULL;
1163 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1164 return NULL;
1165 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1166 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001167 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1168 Py_DECREF(object);
1169 return NULL;
1170 }
1171 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1172 Py_DECREF(object);
1173 Py_DECREF(encode);
1174 return NULL;
1175 }
1176 code = get_standard_encoding(encoding, &bytelength);
1177 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001178 if (code == ENC_UNKNOWN) {
1179 /* Not supported, fail with original exception */
1180 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1181 Py_DECREF(object);
1182 return NULL;
1183 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001184
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001185 if (end - start > PY_SSIZE_T_MAX / bytelength)
1186 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001187 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001188 if (!res) {
1189 Py_DECREF(object);
1190 return NULL;
1191 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001192 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001193 for (i = start; i < end; i++) {
1194 /* object is guaranteed to be "ready" */
1195 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001196 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001197 /* Not a surrogate, fail with original exception */
1198 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1199 Py_DECREF(res);
1200 Py_DECREF(object);
1201 return NULL;
1202 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001203 switch (code) {
1204 case ENC_UTF8:
1205 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1206 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1207 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1208 break;
1209 case ENC_UTF16LE:
1210 *outp++ = (unsigned char) ch;
1211 *outp++ = (unsigned char)(ch >> 8);
1212 break;
1213 case ENC_UTF16BE:
1214 *outp++ = (unsigned char)(ch >> 8);
1215 *outp++ = (unsigned char) ch;
1216 break;
1217 case ENC_UTF32LE:
1218 *outp++ = (unsigned char) ch;
1219 *outp++ = (unsigned char)(ch >> 8);
1220 *outp++ = (unsigned char)(ch >> 16);
1221 *outp++ = (unsigned char)(ch >> 24);
1222 break;
1223 case ENC_UTF32BE:
1224 *outp++ = (unsigned char)(ch >> 24);
1225 *outp++ = (unsigned char)(ch >> 16);
1226 *outp++ = (unsigned char)(ch >> 8);
1227 *outp++ = (unsigned char) ch;
1228 break;
1229 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 }
1231 restuple = Py_BuildValue("(On)", res, end);
1232 Py_DECREF(res);
1233 Py_DECREF(object);
1234 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001235 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001236 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001237 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001238 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 if (PyUnicodeDecodeError_GetStart(exc, &start))
1240 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001241 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1242 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001243 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1244 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001245 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001246 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1247 Py_DECREF(object);
1248 return NULL;
1249 }
1250 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1251 Py_DECREF(object);
1252 Py_DECREF(encode);
1253 return NULL;
1254 }
1255 code = get_standard_encoding(encoding, &bytelength);
1256 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001257 if (code == ENC_UNKNOWN) {
1258 /* Not supported, fail with original exception */
1259 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1260 Py_DECREF(object);
1261 return NULL;
1262 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001263
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 /* Try decoding a single surrogate character. If
1265 there are more, let the codec call us again. */
1266 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001267 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1268 switch (code) {
1269 case ENC_UTF8:
1270 if ((p[0] & 0xf0) == 0xe0 &&
1271 (p[1] & 0xc0) == 0x80 &&
1272 (p[2] & 0xc0) == 0x80) {
1273 /* it's a three-byte code */
1274 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1275 }
1276 break;
1277 case ENC_UTF16LE:
1278 ch = p[1] << 8 | p[0];
1279 break;
1280 case ENC_UTF16BE:
1281 ch = p[0] << 8 | p[1];
1282 break;
1283 case ENC_UTF32LE:
1284 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1285 break;
1286 case ENC_UTF32BE:
1287 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1288 break;
1289 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001290 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001291
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001293 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1294 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001295 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1296 return NULL;
1297 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001298 res = PyUnicode_FromOrdinal(ch);
1299 if (res == NULL)
1300 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001301 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001302 }
1303 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001304 wrong_exception_type(exc);
1305 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001306 }
1307}
1308
Martin v. Löwis011e8422009-05-05 04:43:17 +00001309static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001310PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001311{
1312 PyObject *restuple;
1313 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001314 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001315 Py_ssize_t start;
1316 Py_ssize_t end;
1317 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001318
1319 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001320 char *outp;
1321 if (PyUnicodeEncodeError_GetStart(exc, &start))
1322 return NULL;
1323 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1324 return NULL;
1325 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1326 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001327 res = PyBytes_FromStringAndSize(NULL, end-start);
1328 if (!res) {
1329 Py_DECREF(object);
1330 return NULL;
1331 }
1332 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001333 for (i = start; i < end; i++) {
1334 /* object is guaranteed to be "ready" */
1335 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 if (ch < 0xdc80 || ch > 0xdcff) {
1337 /* Not a UTF-8b surrogate, fail with original exception */
1338 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1339 Py_DECREF(res);
1340 Py_DECREF(object);
1341 return NULL;
1342 }
1343 *outp++ = ch - 0xdc00;
1344 }
1345 restuple = Py_BuildValue("(On)", res, end);
1346 Py_DECREF(res);
1347 Py_DECREF(object);
1348 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001349 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001350 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001351 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001352 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001353 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001354 int consumed = 0;
1355 if (PyUnicodeDecodeError_GetStart(exc, &start))
1356 return NULL;
1357 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1358 return NULL;
1359 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1360 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001361 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362 while (consumed < 4 && consumed < end-start) {
1363 /* Refuse to escape ASCII bytes. */
1364 if (p[start+consumed] < 128)
1365 break;
1366 ch[consumed] = 0xdc00 + p[start+consumed];
1367 consumed++;
1368 }
1369 Py_DECREF(object);
1370 if (!consumed) {
1371 /* codec complained about ASCII byte. */
1372 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1373 return NULL;
1374 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001375 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1376 if (str == NULL)
1377 return NULL;
1378 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001379 }
1380 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001381 wrong_exception_type(exc);
1382 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001383 }
1384}
1385
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001386
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001387static PyObject *strict_errors(PyObject *self, PyObject *exc)
1388{
1389 return PyCodec_StrictErrors(exc);
1390}
1391
1392
1393static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1394{
1395 return PyCodec_IgnoreErrors(exc);
1396}
1397
1398
1399static PyObject *replace_errors(PyObject *self, PyObject *exc)
1400{
1401 return PyCodec_ReplaceErrors(exc);
1402}
1403
1404
1405static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1406{
1407 return PyCodec_XMLCharRefReplaceErrors(exc);
1408}
1409
1410
1411static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1412{
1413 return PyCodec_BackslashReplaceErrors(exc);
1414}
1415
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001416static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1417{
1418 return PyCodec_NameReplaceErrors(exc);
1419}
1420
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001421static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001422{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001423 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001424}
1425
Martin v. Löwis43c57782009-05-10 08:15:24 +00001426static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001427{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001428 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001429}
1430
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001431static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001432{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001433 static struct {
Andy Lester7386a702020-02-13 22:42:56 -06001434 const char *name;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001435 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001436 } methods[] =
1437 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001438 {
1439 "strict",
1440 {
1441 "strict_errors",
1442 strict_errors,
1443 METH_O,
1444 PyDoc_STR("Implements the 'strict' error handling, which "
1445 "raises a UnicodeError on coding errors.")
1446 }
1447 },
1448 {
1449 "ignore",
1450 {
1451 "ignore_errors",
1452 ignore_errors,
1453 METH_O,
1454 PyDoc_STR("Implements the 'ignore' error handling, which "
1455 "ignores malformed data and continues.")
1456 }
1457 },
1458 {
1459 "replace",
1460 {
1461 "replace_errors",
1462 replace_errors,
1463 METH_O,
1464 PyDoc_STR("Implements the 'replace' error handling, which "
1465 "replaces malformed data with a replacement marker.")
1466 }
1467 },
1468 {
1469 "xmlcharrefreplace",
1470 {
1471 "xmlcharrefreplace_errors",
1472 xmlcharrefreplace_errors,
1473 METH_O,
1474 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1475 "which replaces an unencodable character with the "
1476 "appropriate XML character reference.")
1477 }
1478 },
1479 {
1480 "backslashreplace",
1481 {
1482 "backslashreplace_errors",
1483 backslashreplace_errors,
1484 METH_O,
1485 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001486 "which replaces malformed data with a backslashed "
1487 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001488 }
1489 },
1490 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001491 "namereplace",
1492 {
1493 "namereplace_errors",
1494 namereplace_errors,
1495 METH_O,
1496 PyDoc_STR("Implements the 'namereplace' error handling, "
1497 "which replaces an unencodable character with a "
1498 "\\N{...} escape sequence.")
1499 }
1500 },
1501 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001502 "surrogatepass",
1503 {
1504 "surrogatepass",
1505 surrogatepass_errors,
1506 METH_O
1507 }
1508 },
1509 {
1510 "surrogateescape",
1511 {
1512 "surrogateescape",
1513 surrogateescape_errors,
1514 METH_O
1515 }
1516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001518
Victor Stinner81a7be32020-04-14 15:14:01 +02001519 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001520 PyObject *mod;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001521
1522 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001523 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001524
1525 interp->codec_search_path = PyList_New(0);
Victor Stinnerd3a1de22020-01-27 23:23:12 +01001526 if (interp->codec_search_path == NULL) {
1527 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001528 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001529
Victor Stinnerd3a1de22020-01-27 23:23:12 +01001530 interp->codec_search_cache = PyDict_New();
1531 if (interp->codec_search_cache == NULL) {
1532 return -1;
1533 }
1534
1535 interp->codec_error_registry = PyDict_New();
1536 if (interp->codec_error_registry == NULL) {
1537 return -1;
1538 }
1539
1540 for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1541 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1542 if (!func) {
1543 return -1;
1544 }
1545
1546 int res = PyCodec_RegisterError(methods[i].name, func);
1547 Py_DECREF(func);
1548 if (res) {
1549 return -1;
1550 }
1551 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001552
Christian Heimes819b8bf2008-01-03 23:05:47 +00001553 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001554 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001555 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001556 }
1557 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001558 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001559 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001560}