blob: 7b35ded2edcd580daeca0d95ce47ea81dba74783 [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010012#include "pycore_pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
/* Import the standard encodings package, which registers the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   slow down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Victor Stinnerff4584c2020-03-13 18:03:56 +010035 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Jordon Xu20f59fe2019-08-21 21:26:20 +080052extern int _Py_normalize_encoding(const char *, char *, size_t);
53
54/* Convert a string to a normalized Python string(decoded from UTF-8): all characters are
55 converted to lower case, spaces and hyphens are replaced with underscores. */
Guido van Rossum9e896b32000-04-05 20:11:21 +000056
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057static
Guido van Rossum9e896b32000-04-05 20:11:21 +000058PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000059{
Guido van Rossum582acec2000-06-28 22:07:35 +000060 size_t len = strlen(string);
Jordon Xu20f59fe2019-08-21 21:26:20 +080061 char *encoding;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000062 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000065 PyErr_SetString(PyExc_OverflowError, "string is too large");
66 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000067 }
Guido van Rossum21431e82007-10-19 21:48:41 +000068
Jordon Xu20f59fe2019-08-21 21:26:20 +080069 encoding = PyMem_Malloc(len + 1);
70 if (encoding == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020071 return PyErr_NoMemory();
Jordon Xu20f59fe2019-08-21 21:26:20 +080072
73 if (!_Py_normalize_encoding(string, encoding, len + 1))
74 {
75 PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
76 PyMem_Free(encoding);
77 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Jordon Xu20f59fe2019-08-21 21:26:20 +080079
80 v = PyUnicode_FromString(encoding);
81 PyMem_Free(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Fred Drake766de832000-05-09 19:55:59 +0000102 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 PyErr_BadArgument();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200104 return NULL;
Fred Drake766de832000-05-09 19:55:59 +0000105 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106
Victor Stinnercaba55b2018-08-03 15:33:52 +0200107 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200108 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
109 return NULL;
110 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000111
Guido van Rossum9e896b32000-04-05 20:11:21 +0000112 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000113 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000114 replaced with underscores. */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200115 PyObject *v = normalizestring(encoding);
116 if (v == NULL) {
117 return NULL;
118 }
Guido van Rossum21431e82007-10-19 21:48:41 +0000119 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120
121 /* First, try to lookup the name in the registry dictionary */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200122 PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000124 Py_INCREF(result);
125 Py_DECREF(v);
126 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200128 else if (PyErr_Occurred()) {
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200129 goto onError;
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200130 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000131
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 /* Next, scan the search functions in order of registration */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200133 const Py_ssize_t len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000134 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000136 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 PyErr_SetString(PyExc_LookupError,
138 "no codec search functions registered: "
139 "can't find encoding");
140 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000141 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000142
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200143 Py_ssize_t i;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000145 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000146
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 func = PyList_GetItem(interp->codec_search_path, i);
148 if (func == NULL)
149 goto onError;
Petr Viktorinffd97532020-02-11 17:46:57 +0100150 result = PyObject_CallOneArg(func, v);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 if (result == NULL)
152 goto onError;
153 if (result == Py_None) {
154 Py_DECREF(result);
155 continue;
156 }
157 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
158 PyErr_SetString(PyExc_TypeError,
159 "codec search functions must return 4-tuples");
160 Py_DECREF(result);
161 goto onError;
162 }
163 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000164 }
165 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000166 /* XXX Perhaps we should cache misses too ? */
167 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000168 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 }
171
172 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000173 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 Py_DECREF(result);
175 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 }
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200177 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 return result;
179
180 onError:
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200181 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000182 return NULL;
183}
184
Nick Coghlan8fad1672014-09-15 23:50:44 +1200185int _PyCodec_Forget(const char *encoding)
186{
Nick Coghlan8fad1672014-09-15 23:50:44 +1200187 PyObject *v;
188 int result;
189
Victor Stinnerff4584c2020-03-13 18:03:56 +0100190 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Nick Coghlan8fad1672014-09-15 23:50:44 +1200191 if (interp->codec_search_path == NULL) {
192 return -1;
193 }
194
195 /* Convert the encoding to a normalized Python string: all
196 characters are converted to lower case, spaces and hyphens are
197 replaced with underscores. */
198 v = normalizestring(encoding);
199 if (v == NULL) {
200 return -1;
201 }
202
203 /* Drop the named codec from the internal cache */
204 result = PyDict_DelItem(interp->codec_search_cache, v);
205 Py_DECREF(v);
206
207 return result;
208}
209
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000210/* Codec registry encoding check API. */
211
212int PyCodec_KnownEncoding(const char *encoding)
213{
214 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000216 codecs = _PyCodec_Lookup(encoding);
217 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000218 PyErr_Clear();
219 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000220 }
221 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 Py_DECREF(codecs);
223 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000224 }
225}
226
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000227static
228PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230{
231 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000232
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233 args = PyTuple_New(1 + (errors != NULL));
234 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236 Py_INCREF(object);
237 PyTuple_SET_ITEM(args,0,object);
238 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000240
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 v = PyUnicode_FromString(errors);
242 if (v == NULL) {
243 Py_DECREF(args);
244 return NULL;
245 }
246 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000247 }
248 return args;
249}
250
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000252
253static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000254PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000255{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyObject *codecs;
257 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000258
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259 codecs = _PyCodec_Lookup(encoding);
260 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 v = PyTuple_GET_ITEM(codecs, index);
263 Py_DECREF(codecs);
264 Py_INCREF(v);
265 return v;
266}
267
Nick Coghlana9b15242014-02-04 22:11:18 +1000268/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000269static
Nick Coghlana9b15242014-02-04 22:11:18 +1000270PyObject *codec_makeincrementalcodec(PyObject *codec_info,
271 const char *errors,
272 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000273{
Nick Coghlana9b15242014-02-04 22:11:18 +1000274 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275
Nick Coghlana9b15242014-02-04 22:11:18 +1000276 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000280 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100282 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 Py_DECREF(inccodec);
284 return ret;
285}
286
Nick Coghlana9b15242014-02-04 22:11:18 +1000287static
288PyObject *codec_getincrementalcodec(const char *encoding,
289 const char *errors,
290 const char *attrname)
291{
292 PyObject *codec_info, *ret;
293
294 codec_info = _PyCodec_Lookup(encoding);
295 if (codec_info == NULL)
296 return NULL;
297 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
298 Py_DECREF(codec_info);
299 return ret;
300}
301
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000302/* Helper function to create a stream codec. */
303
304static
305PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000306 PyObject *stream,
307 const char *errors,
308 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000309{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000310 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000311
312 codecs = _PyCodec_Lookup(encoding);
313 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000314 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000316 codeccls = PyTuple_GET_ITEM(codecs, index);
317 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000319 else
Petr Viktorinffd97532020-02-11 17:46:57 +0100320 streamcodec = PyObject_CallOneArg(codeccls, stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000321 Py_DECREF(codecs);
322 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000323}
324
Nick Coghlana9b15242014-02-04 22:11:18 +1000325/* Helpers to work with the result of _PyCodec_Lookup
326
327 */
328PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
329 const char *errors)
330{
331 return codec_makeincrementalcodec(codec_info, errors,
332 "incrementaldecoder");
333}
334
335PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
336 const char *errors)
337{
338 return codec_makeincrementalcodec(codec_info, errors,
339 "incrementalencoder");
340}
341
342
Guido van Rossum98297ee2007-11-06 21:34:58 +0000343/* Convenience APIs to query the Codec registry.
344
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000345 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000346
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000347 */
348
349PyObject *PyCodec_Encoder(const char *encoding)
350{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352}
353
354PyObject *PyCodec_Decoder(const char *encoding)
355{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000357}
358
Thomas Woutersa9773292006-04-21 09:43:23 +0000359PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000360 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000361{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000363}
364
365PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000367{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000369}
370
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000371PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 PyObject *stream,
373 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000374{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376}
377
378PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 PyObject *stream,
380 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000381{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000382 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383}
384
/* Helper that tries to make the reported exception chain name the codec
 * and operation that triggered the failure, without changing the type of
 * the exception raised.
 *
 * The work is done by _PyErr_TrySetFromCause(): it replaces the active
 * exception with a suitably updated clone if it can, and otherwise
 * leaves the original exception alone.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
400
Martin Panter6245cb32016-04-15 02:14:19 +0000401/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000402 and return the resulting encoded object (usually a Python string).
403
404 errors is passed to the encoder factory as argument if non-NULL. */
405
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000406static PyObject *
407_PyCodec_EncodeInternal(PyObject *object,
408 PyObject *encoder,
409 const char *encoding,
410 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000411{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000412 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000413 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000414
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415 args = args_tuple(object, errors);
416 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000418
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200419 result = PyObject_Call(encoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000420 if (result == NULL) {
421 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000423 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000424
Guido van Rossum98297ee2007-11-06 21:34:58 +0000425 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 PyTuple_GET_SIZE(result) != 2) {
427 PyErr_SetString(PyExc_TypeError,
428 "encoder must return a tuple (object, integer)");
429 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000430 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000431 v = PyTuple_GET_ITEM(result,0);
432 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000433 /* We don't check or use the second (integer) entry. */
434
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000435 Py_DECREF(args);
436 Py_DECREF(encoder);
437 Py_DECREF(result);
438 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000440 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000441 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000442 Py_XDECREF(args);
443 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000444 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000445}
446
447/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000448 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000449
450 errors is passed to the decoder factory as argument if non-NULL. */
451
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000452static PyObject *
453_PyCodec_DecodeInternal(PyObject *object,
454 PyObject *decoder,
455 const char *encoding,
456 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000457{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000458 PyObject *args = NULL, *result = NULL;
459 PyObject *v;
460
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000461 args = args_tuple(object, errors);
462 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000464
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200465 result = PyObject_Call(decoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000466 if (result == NULL) {
467 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000469 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000470 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000471 PyTuple_GET_SIZE(result) != 2) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoder must return a tuple (object,integer)");
474 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000475 }
476 v = PyTuple_GET_ITEM(result,0);
477 Py_INCREF(v);
478 /* We don't check or use the second (integer) entry. */
479
480 Py_DECREF(args);
481 Py_DECREF(decoder);
482 Py_DECREF(result);
483 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000484
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000485 onError:
486 Py_XDECREF(args);
487 Py_XDECREF(decoder);
488 Py_XDECREF(result);
489 return NULL;
490}
491
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000492/* Generic encoding/decoding API */
493PyObject *PyCodec_Encode(PyObject *object,
494 const char *encoding,
495 const char *errors)
496{
497 PyObject *encoder;
498
499 encoder = PyCodec_Encoder(encoding);
500 if (encoder == NULL)
501 return NULL;
502
503 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
504}
505
506PyObject *PyCodec_Decode(PyObject *object,
507 const char *encoding,
508 const char *errors)
509{
510 PyObject *decoder;
511
512 decoder = PyCodec_Decoder(encoding);
513 if (decoder == NULL)
514 return NULL;
515
516 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
517}
518
519/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000520PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
521 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000522{
523 _Py_IDENTIFIER(_is_text_encoding);
524 PyObject *codec;
525 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000526 int is_text_codec;
527
528 codec = _PyCodec_Lookup(encoding);
529 if (codec == NULL)
530 return NULL;
531
532 /* Backwards compatibility: assume any raw tuple describes a text
533 * encoding, and the same for anything lacking the private
534 * attribute.
535 */
536 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200537 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
538 Py_DECREF(codec);
539 return NULL;
540 }
541 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000542 is_text_codec = PyObject_IsTrue(attr);
543 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300544 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000545 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300546 if (!is_text_codec)
547 PyErr_Format(PyExc_LookupError,
548 "'%.400s' is not a text encoding; "
549 "use %s to handle arbitrary codecs",
550 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000551 return NULL;
552 }
553 }
554 }
555
Nick Coghlana9b15242014-02-04 22:11:18 +1000556 /* This appears to be a valid text encoding */
557 return codec;
558}
559
560
561static
562PyObject *codec_getitem_checked(const char *encoding,
563 const char *alternate_command,
564 int index)
565{
566 PyObject *codec;
567 PyObject *v;
568
569 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
570 if (codec == NULL)
571 return NULL;
572
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000573 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000574 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000575 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000576 return v;
577}
578
579static PyObject * _PyCodec_TextEncoder(const char *encoding)
580{
Nick Coghlana9b15242014-02-04 22:11:18 +1000581 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582}
583
584static PyObject * _PyCodec_TextDecoder(const char *encoding)
585{
Nick Coghlana9b15242014-02-04 22:11:18 +1000586 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000587}
588
589PyObject *_PyCodec_EncodeText(PyObject *object,
590 const char *encoding,
591 const char *errors)
592{
593 PyObject *encoder;
594
595 encoder = _PyCodec_TextEncoder(encoding);
596 if (encoder == NULL)
597 return NULL;
598
599 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
600}
601
602PyObject *_PyCodec_DecodeText(PyObject *object,
603 const char *encoding,
604 const char *errors)
605{
606 PyObject *decoder;
607
608 decoder = _PyCodec_TextDecoder(encoding);
609 if (decoder == NULL)
610 return NULL;
611
612 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
613}
614
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000615/* Register the error handling callback function error under the name
616 name. This function will be called by the codec when it encounters
617 an unencodable characters/undecodable bytes and doesn't know the
618 callback name, when name is specified as the error parameter
619 in the call to the encode/decode function.
620 Return 0 on success, -1 on error */
621int PyCodec_RegisterError(const char *name, PyObject *error)
622{
Victor Stinnerff4584c2020-03-13 18:03:56 +0100623 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000624 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 PyErr_SetString(PyExc_TypeError, "handler must be callable");
628 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000629 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000630 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300631 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000632}
633
634/* Lookup the error handling callback function registered under the
635 name error. As a special case NULL can be passed, in which case
636 the error handling callback for strict encoding will be returned. */
637PyObject *PyCodec_LookupError(const char *name)
638{
639 PyObject *handler = NULL;
640
Victor Stinnercaba55b2018-08-03 15:33:52 +0200641 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000642 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000644
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 name = "strict";
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200647 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
648 if (handler) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 Py_INCREF(handler);
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200650 }
651 else if (!PyErr_Occurred()) {
652 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000654 return handler;
655}
656
657static void wrong_exception_type(PyObject *exc)
658{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300659 PyErr_Format(PyExc_TypeError,
660 "don't know how to handle %.200s in error callback",
Victor Stinnera102ed72020-02-07 02:24:48 +0100661 Py_TYPE(exc)->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662}
663
664PyObject *PyCodec_StrictErrors(PyObject *exc)
665{
Brett Cannonbf364092006-03-01 04:25:17 +0000666 if (PyExceptionInstance_Check(exc))
667 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670 return NULL;
671}
672
673
674PyObject *PyCodec_IgnoreErrors(PyObject *exc)
675{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000676 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300677
678 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 if (PyUnicodeEncodeError_GetEnd(exc, &end))
680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300682 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 if (PyUnicodeDecodeError_GetEnd(exc, &end))
684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300686 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 if (PyUnicodeTranslateError_GetEnd(exc, &end))
688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000689 }
690 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 wrong_exception_type(exc);
692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 }
Victor Stinneree450092011-12-01 02:52:11 +0100694 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695}
696
697
698PyObject *PyCodec_ReplaceErrors(PyObject *exc)
699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300702 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000703 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300704 Py_UCS1 *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (PyUnicodeEncodeError_GetStart(exc, &start))
706 return NULL;
707 if (PyUnicodeEncodeError_GetEnd(exc, &end))
708 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 len = end - start;
710 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 if (res == NULL)
712 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300713 assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
714 outp = PyUnicode_1BYTE_DATA(res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200715 for (i = 0; i < len; ++i)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300716 outp[i] = '?';
Victor Stinner8f825062012-04-27 13:55:39 +0200717 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000719 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300720 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000721 if (PyUnicodeDecodeError_GetEnd(exc, &end))
722 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200723 return Py_BuildValue("(Cn)",
724 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
725 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000726 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300727 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300729 Py_UCS2 *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000730 if (PyUnicodeTranslateError_GetStart(exc, &start))
731 return NULL;
732 if (PyUnicodeTranslateError_GetEnd(exc, &end))
733 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200734 len = end - start;
735 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 if (res == NULL)
737 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300738 assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
739 outp = PyUnicode_2BYTE_DATA(res);
740 for (i = 0; i < len; i++)
741 outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
Victor Stinner8f825062012-04-27 13:55:39 +0200742 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200743 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000744 }
745 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 wrong_exception_type(exc);
747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000748 }
749}
750
751PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
752{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300753 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 PyObject *restuple;
755 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100756 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000757 Py_ssize_t start;
758 Py_ssize_t end;
759 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300760 Py_UCS1 *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300761 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100762 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 if (PyUnicodeEncodeError_GetStart(exc, &start))
764 return NULL;
765 if (PyUnicodeEncodeError_GetEnd(exc, &end))
766 return NULL;
767 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
768 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300769 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
770 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100771 for (i = start, ressize = 0; i < end; ++i) {
772 /* object is guaranteed to be "ready" */
773 ch = PyUnicode_READ_CHAR(object, i);
774 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100776 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100778 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100780 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+6+1;
786 else
787 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 }
789 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 if (res == NULL) {
792 Py_DECREF(object);
793 return NULL;
794 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100795 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100797 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 int digits;
799 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100800 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000801 *outp++ = '&';
802 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100803 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 digits = 1;
805 base = 1;
806 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100807 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 digits = 2;
809 base = 10;
810 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100811 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 digits = 3;
813 base = 100;
814 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100815 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 digits = 4;
817 base = 1000;
818 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100819 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 digits = 5;
821 base = 10000;
822 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100823 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 digits = 6;
825 base = 100000;
826 }
827 else {
828 digits = 7;
829 base = 1000000;
830 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100832 *outp++ = '0' + ch/base;
833 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000834 base /= 10;
835 }
836 *outp++ = ';';
837 }
Victor Stinner8f825062012-04-27 13:55:39 +0200838 assert(_PyUnicode_CheckConsistency(res, 1));
839 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000840 Py_DECREF(object);
841 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000842 }
843 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844 wrong_exception_type(exc);
845 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000846 }
847}
848
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000849PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
850{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200851 PyObject *object;
852 Py_ssize_t i;
853 Py_ssize_t start;
854 Py_ssize_t end;
855 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300856 Py_UCS1 *outp;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200857 int ressize;
858 Py_UCS4 c;
859
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300860 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300861 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200862 if (PyUnicodeDecodeError_GetStart(exc, &start))
863 return NULL;
864 if (PyUnicodeDecodeError_GetEnd(exc, &end))
865 return NULL;
866 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
867 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300868 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200869 res = PyUnicode_New(4 * (end - start), 127);
870 if (res == NULL) {
871 Py_DECREF(object);
872 return NULL;
873 }
874 outp = PyUnicode_1BYTE_DATA(res);
875 for (i = start; i < end; i++, outp += 4) {
876 unsigned char c = p[i];
877 outp[0] = '\\';
878 outp[1] = 'x';
879 outp[2] = Py_hexdigits[(c>>4)&0xf];
880 outp[3] = Py_hexdigits[c&0xf];
881 }
882
883 assert(_PyUnicode_CheckConsistency(res, 1));
884 Py_DECREF(object);
885 return Py_BuildValue("(Nn)", res, end);
886 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300887 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888 if (PyUnicodeEncodeError_GetStart(exc, &start))
889 return NULL;
890 if (PyUnicodeEncodeError_GetEnd(exc, &end))
891 return NULL;
892 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
893 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200894 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300895 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200896 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200898 if (PyUnicodeTranslateError_GetEnd(exc, &end))
899 return NULL;
900 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
901 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000902 }
903 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904 wrong_exception_type(exc);
905 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000906 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200907
908 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
909 end = start + PY_SSIZE_T_MAX / (1+1+8);
910 for (i = start, ressize = 0; i < end; ++i) {
911 /* object is guaranteed to be "ready" */
912 c = PyUnicode_READ_CHAR(object, i);
913 if (c >= 0x10000) {
914 ressize += 1+1+8;
915 }
916 else if (c >= 0x100) {
917 ressize += 1+1+4;
918 }
919 else
920 ressize += 1+1+2;
921 }
922 res = PyUnicode_New(ressize, 127);
923 if (res == NULL) {
924 Py_DECREF(object);
925 return NULL;
926 }
927 outp = PyUnicode_1BYTE_DATA(res);
928 for (i = start; i < end; ++i) {
929 c = PyUnicode_READ_CHAR(object, i);
930 *outp++ = '\\';
931 if (c >= 0x00010000) {
932 *outp++ = 'U';
933 *outp++ = Py_hexdigits[(c>>28)&0xf];
934 *outp++ = Py_hexdigits[(c>>24)&0xf];
935 *outp++ = Py_hexdigits[(c>>20)&0xf];
936 *outp++ = Py_hexdigits[(c>>16)&0xf];
937 *outp++ = Py_hexdigits[(c>>12)&0xf];
938 *outp++ = Py_hexdigits[(c>>8)&0xf];
939 }
940 else if (c >= 0x100) {
941 *outp++ = 'u';
942 *outp++ = Py_hexdigits[(c>>12)&0xf];
943 *outp++ = Py_hexdigits[(c>>8)&0xf];
944 }
945 else
946 *outp++ = 'x';
947 *outp++ = Py_hexdigits[(c>>4)&0xf];
948 *outp++ = Py_hexdigits[c&0xf];
949 }
950
951 assert(_PyUnicode_CheckConsistency(res, 1));
952 Py_DECREF(object);
953 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000954}
955
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200956static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200957
958PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
959{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300960 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200961 PyObject *restuple;
962 PyObject *object;
963 Py_ssize_t i;
964 Py_ssize_t start;
965 Py_ssize_t end;
966 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300967 Py_UCS1 *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200968 Py_ssize_t ressize;
969 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200970 Py_UCS4 c;
971 char buffer[256]; /* NAME_MAXLEN */
972 if (PyUnicodeEncodeError_GetStart(exc, &start))
973 return NULL;
974 if (PyUnicodeEncodeError_GetEnd(exc, &end))
975 return NULL;
976 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
977 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200978 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200979 /* load the unicode data module */
980 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
981 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200982 if (!ucnhash_CAPI)
983 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200984 }
985 for (i = start, ressize = 0; i < end; ++i) {
986 /* object is guaranteed to be "ready" */
987 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200988 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200989 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200990 }
991 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200992 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200993 }
994 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200995 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200996 }
997 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200998 replsize = 1+1+2;
999 if (ressize > PY_SSIZE_T_MAX - replsize)
1000 break;
1001 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001002 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001003 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001004 res = PyUnicode_New(ressize, 127);
1005 if (res==NULL)
1006 return NULL;
1007 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1008 i < end; ++i) {
1009 c = PyUnicode_READ_CHAR(object, i);
1010 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001011 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001012 *outp++ = 'N';
1013 *outp++ = '{';
1014 strcpy((char *)outp, buffer);
1015 outp += strlen(buffer);
1016 *outp++ = '}';
1017 continue;
1018 }
1019 if (c >= 0x00010000) {
1020 *outp++ = 'U';
1021 *outp++ = Py_hexdigits[(c>>28)&0xf];
1022 *outp++ = Py_hexdigits[(c>>24)&0xf];
1023 *outp++ = Py_hexdigits[(c>>20)&0xf];
1024 *outp++ = Py_hexdigits[(c>>16)&0xf];
1025 *outp++ = Py_hexdigits[(c>>12)&0xf];
1026 *outp++ = Py_hexdigits[(c>>8)&0xf];
1027 }
1028 else if (c >= 0x100) {
1029 *outp++ = 'u';
1030 *outp++ = Py_hexdigits[(c>>12)&0xf];
1031 *outp++ = Py_hexdigits[(c>>8)&0xf];
1032 }
1033 else
1034 *outp++ = 'x';
1035 *outp++ = Py_hexdigits[(c>>4)&0xf];
1036 *outp++ = Py_hexdigits[c&0xf];
1037 }
1038
Benjamin Peterson3663b582014-11-26 14:39:54 -06001039 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001040 assert(_PyUnicode_CheckConsistency(res, 1));
1041 restuple = Py_BuildValue("(Nn)", res, end);
1042 Py_DECREF(object);
1043 return restuple;
1044 }
1045 else {
1046 wrong_exception_type(exc);
1047 return NULL;
1048 }
1049}
1050
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001051#define ENC_UNKNOWN -1
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001052#define ENC_UTF8 0
1053#define ENC_UTF16BE 1
1054#define ENC_UTF16LE 2
1055#define ENC_UTF32BE 3
1056#define ENC_UTF32LE 4
1057
1058static int
1059get_standard_encoding(const char *encoding, int *bytelength)
1060{
1061 if (Py_TOLOWER(encoding[0]) == 'u' &&
1062 Py_TOLOWER(encoding[1]) == 't' &&
1063 Py_TOLOWER(encoding[2]) == 'f') {
1064 encoding += 3;
1065 if (*encoding == '-' || *encoding == '_' )
1066 encoding++;
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001067 if (encoding[0] == '8' && encoding[1] == '\0') {
1068 *bytelength = 3;
1069 return ENC_UTF8;
1070 }
1071 else if (encoding[0] == '1' && encoding[1] == '6') {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001072 encoding += 2;
1073 *bytelength = 2;
1074 if (*encoding == '\0') {
1075#ifdef WORDS_BIGENDIAN
1076 return ENC_UTF16BE;
1077#else
1078 return ENC_UTF16LE;
1079#endif
1080 }
1081 if (*encoding == '-' || *encoding == '_' )
1082 encoding++;
1083 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1084 if (Py_TOLOWER(encoding[0]) == 'b')
1085 return ENC_UTF16BE;
1086 if (Py_TOLOWER(encoding[0]) == 'l')
1087 return ENC_UTF16LE;
1088 }
1089 }
1090 else if (encoding[0] == '3' && encoding[1] == '2') {
1091 encoding += 2;
1092 *bytelength = 4;
1093 if (*encoding == '\0') {
1094#ifdef WORDS_BIGENDIAN
1095 return ENC_UTF32BE;
1096#else
1097 return ENC_UTF32LE;
1098#endif
1099 }
1100 if (*encoding == '-' || *encoding == '_' )
1101 encoding++;
1102 if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1103 if (Py_TOLOWER(encoding[0]) == 'b')
1104 return ENC_UTF32BE;
1105 if (Py_TOLOWER(encoding[0]) == 'l')
1106 return ENC_UTF32LE;
1107 }
1108 }
1109 }
Victor Stinner0d4e01c2014-05-16 14:46:20 +02001110 else if (strcmp(encoding, "CP_UTF8") == 0) {
1111 *bytelength = 3;
1112 return ENC_UTF8;
1113 }
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001114 return ENC_UNKNOWN;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001115}
1116
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001117/* This handler is declared static until someone demonstrates
1118 a need to call it directly. */
1119static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001120PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001121{
1122 PyObject *restuple;
1123 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001124 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001125 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001126 int code;
1127 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001128 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001129 Py_ssize_t start;
1130 Py_ssize_t end;
1131 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001132
1133 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001134 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001135 if (PyUnicodeEncodeError_GetStart(exc, &start))
1136 return NULL;
1137 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1138 return NULL;
1139 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1140 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001141 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1142 Py_DECREF(object);
1143 return NULL;
1144 }
1145 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1146 Py_DECREF(object);
1147 Py_DECREF(encode);
1148 return NULL;
1149 }
1150 code = get_standard_encoding(encoding, &bytelength);
1151 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001152 if (code == ENC_UNKNOWN) {
1153 /* Not supported, fail with original exception */
1154 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1155 Py_DECREF(object);
1156 return NULL;
1157 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001158
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001159 if (end - start > PY_SSIZE_T_MAX / bytelength)
1160 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001161 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 if (!res) {
1163 Py_DECREF(object);
1164 return NULL;
1165 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001166 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001167 for (i = start; i < end; i++) {
1168 /* object is guaranteed to be "ready" */
1169 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001170 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171 /* Not a surrogate, fail with original exception */
1172 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1173 Py_DECREF(res);
1174 Py_DECREF(object);
1175 return NULL;
1176 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001177 switch (code) {
1178 case ENC_UTF8:
1179 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1180 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1181 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1182 break;
1183 case ENC_UTF16LE:
1184 *outp++ = (unsigned char) ch;
1185 *outp++ = (unsigned char)(ch >> 8);
1186 break;
1187 case ENC_UTF16BE:
1188 *outp++ = (unsigned char)(ch >> 8);
1189 *outp++ = (unsigned char) ch;
1190 break;
1191 case ENC_UTF32LE:
1192 *outp++ = (unsigned char) ch;
1193 *outp++ = (unsigned char)(ch >> 8);
1194 *outp++ = (unsigned char)(ch >> 16);
1195 *outp++ = (unsigned char)(ch >> 24);
1196 break;
1197 case ENC_UTF32BE:
1198 *outp++ = (unsigned char)(ch >> 24);
1199 *outp++ = (unsigned char)(ch >> 16);
1200 *outp++ = (unsigned char)(ch >> 8);
1201 *outp++ = (unsigned char) ch;
1202 break;
1203 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001204 }
1205 restuple = Py_BuildValue("(On)", res, end);
1206 Py_DECREF(res);
1207 Py_DECREF(object);
1208 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001209 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001210 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001211 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001212 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001213 if (PyUnicodeDecodeError_GetStart(exc, &start))
1214 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001215 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1216 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1218 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001219 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001220 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1221 Py_DECREF(object);
1222 return NULL;
1223 }
1224 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1225 Py_DECREF(object);
1226 Py_DECREF(encode);
1227 return NULL;
1228 }
1229 code = get_standard_encoding(encoding, &bytelength);
1230 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001231 if (code == ENC_UNKNOWN) {
1232 /* Not supported, fail with original exception */
1233 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1234 Py_DECREF(object);
1235 return NULL;
1236 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001237
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001238 /* Try decoding a single surrogate character. If
1239 there are more, let the codec call us again. */
1240 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001241 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1242 switch (code) {
1243 case ENC_UTF8:
1244 if ((p[0] & 0xf0) == 0xe0 &&
1245 (p[1] & 0xc0) == 0x80 &&
1246 (p[2] & 0xc0) == 0x80) {
1247 /* it's a three-byte code */
1248 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1249 }
1250 break;
1251 case ENC_UTF16LE:
1252 ch = p[1] << 8 | p[0];
1253 break;
1254 case ENC_UTF16BE:
1255 ch = p[0] << 8 | p[1];
1256 break;
1257 case ENC_UTF32LE:
1258 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1259 break;
1260 case ENC_UTF32BE:
1261 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1262 break;
1263 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001264 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001265
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001267 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1268 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001269 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1270 return NULL;
1271 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001272 res = PyUnicode_FromOrdinal(ch);
1273 if (res == NULL)
1274 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001275 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001276 }
1277 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 wrong_exception_type(exc);
1279 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001280 }
1281}
1282
Martin v. Löwis011e8422009-05-05 04:43:17 +00001283static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001284PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001285{
1286 PyObject *restuple;
1287 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001288 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001289 Py_ssize_t start;
1290 Py_ssize_t end;
1291 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001292
1293 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 char *outp;
1295 if (PyUnicodeEncodeError_GetStart(exc, &start))
1296 return NULL;
1297 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1298 return NULL;
1299 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1300 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001301 res = PyBytes_FromStringAndSize(NULL, end-start);
1302 if (!res) {
1303 Py_DECREF(object);
1304 return NULL;
1305 }
1306 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001307 for (i = start; i < end; i++) {
1308 /* object is guaranteed to be "ready" */
1309 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 if (ch < 0xdc80 || ch > 0xdcff) {
1311 /* Not a UTF-8b surrogate, fail with original exception */
1312 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1313 Py_DECREF(res);
1314 Py_DECREF(object);
1315 return NULL;
1316 }
1317 *outp++ = ch - 0xdc00;
1318 }
1319 restuple = Py_BuildValue("(On)", res, end);
1320 Py_DECREF(res);
1321 Py_DECREF(object);
1322 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001323 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001324 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001325 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001326 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001327 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001328 int consumed = 0;
1329 if (PyUnicodeDecodeError_GetStart(exc, &start))
1330 return NULL;
1331 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1332 return NULL;
1333 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1334 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001335 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001336 while (consumed < 4 && consumed < end-start) {
1337 /* Refuse to escape ASCII bytes. */
1338 if (p[start+consumed] < 128)
1339 break;
1340 ch[consumed] = 0xdc00 + p[start+consumed];
1341 consumed++;
1342 }
1343 Py_DECREF(object);
1344 if (!consumed) {
1345 /* codec complained about ASCII byte. */
1346 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1347 return NULL;
1348 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001349 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1350 if (str == NULL)
1351 return NULL;
1352 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001353 }
1354 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001355 wrong_exception_type(exc);
1356 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001357 }
1358}
1359
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001360
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001361static PyObject *strict_errors(PyObject *self, PyObject *exc)
1362{
1363 return PyCodec_StrictErrors(exc);
1364}
1365
1366
1367static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1368{
1369 return PyCodec_IgnoreErrors(exc);
1370}
1371
1372
1373static PyObject *replace_errors(PyObject *self, PyObject *exc)
1374{
1375 return PyCodec_ReplaceErrors(exc);
1376}
1377
1378
1379static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1380{
1381 return PyCodec_XMLCharRefReplaceErrors(exc);
1382}
1383
1384
1385static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1386{
1387 return PyCodec_BackslashReplaceErrors(exc);
1388}
1389
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001390static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1391{
1392 return PyCodec_NameReplaceErrors(exc);
1393}
1394
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001395static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001396{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001397 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001398}
1399
Martin v. Löwis43c57782009-05-10 08:15:24 +00001400static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001401{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001402 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001403}
1404
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001405static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001406{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001407 static struct {
Andy Lester7386a702020-02-13 22:42:56 -06001408 const char *name;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001410 } methods[] =
1411 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 {
1413 "strict",
1414 {
1415 "strict_errors",
1416 strict_errors,
1417 METH_O,
1418 PyDoc_STR("Implements the 'strict' error handling, which "
1419 "raises a UnicodeError on coding errors.")
1420 }
1421 },
1422 {
1423 "ignore",
1424 {
1425 "ignore_errors",
1426 ignore_errors,
1427 METH_O,
1428 PyDoc_STR("Implements the 'ignore' error handling, which "
1429 "ignores malformed data and continues.")
1430 }
1431 },
1432 {
1433 "replace",
1434 {
1435 "replace_errors",
1436 replace_errors,
1437 METH_O,
1438 PyDoc_STR("Implements the 'replace' error handling, which "
1439 "replaces malformed data with a replacement marker.")
1440 }
1441 },
1442 {
1443 "xmlcharrefreplace",
1444 {
1445 "xmlcharrefreplace_errors",
1446 xmlcharrefreplace_errors,
1447 METH_O,
1448 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1449 "which replaces an unencodable character with the "
1450 "appropriate XML character reference.")
1451 }
1452 },
1453 {
1454 "backslashreplace",
1455 {
1456 "backslashreplace_errors",
1457 backslashreplace_errors,
1458 METH_O,
1459 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001460 "which replaces malformed data with a backslashed "
1461 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001462 }
1463 },
1464 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001465 "namereplace",
1466 {
1467 "namereplace_errors",
1468 namereplace_errors,
1469 METH_O,
1470 PyDoc_STR("Implements the 'namereplace' error handling, "
1471 "which replaces an unencodable character with a "
1472 "\\N{...} escape sequence.")
1473 }
1474 },
1475 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001476 "surrogatepass",
1477 {
1478 "surrogatepass",
1479 surrogatepass_errors,
1480 METH_O
1481 }
1482 },
1483 {
1484 "surrogateescape",
1485 {
1486 "surrogateescape",
1487 surrogateescape_errors,
1488 METH_O
1489 }
1490 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001491 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492
Victor Stinnerff4584c2020-03-13 18:03:56 +01001493 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001494 PyObject *mod;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001495
1496 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001497 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001498
1499 interp->codec_search_path = PyList_New(0);
Victor Stinnerd3a1de22020-01-27 23:23:12 +01001500 if (interp->codec_search_path == NULL) {
1501 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001502 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001503
Victor Stinnerd3a1de22020-01-27 23:23:12 +01001504 interp->codec_search_cache = PyDict_New();
1505 if (interp->codec_search_cache == NULL) {
1506 return -1;
1507 }
1508
1509 interp->codec_error_registry = PyDict_New();
1510 if (interp->codec_error_registry == NULL) {
1511 return -1;
1512 }
1513
1514 for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1515 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1516 if (!func) {
1517 return -1;
1518 }
1519
1520 int res = PyCodec_RegisterError(methods[i].name, func);
1521 Py_DECREF(func);
1522 if (res) {
1523 return -1;
1524 }
1525 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001526
Christian Heimes819b8bf2008-01-03 23:05:47 +00001527 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001528 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001529 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001530 }
1531 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001532 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001533 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001534}