blob: fa329ce24364276f381e62c09fe106f8a3d13ca7 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinnere5014be2020-04-14 17:52:15 +020012#include "pycore_interp.h" // PyInterpreterState.codec_search_path
13#include "pycore_pystate.h" // _PyInterpreterState_GET()
Victor Stinner47e1afd2020-10-26 16:43:47 +010014#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
Guido van Rossumfeee4b92000-03-10 22:57:27 +000015#include <ctype.h>
16
Victor Stinnerf5cff562011-10-14 02:13:11 +020017const char *Py_hexdigits = "0123456789abcdef";
18
Guido van Rossumfeee4b92000-03-10 22:57:27 +000019/* --- Codec Registry ----------------------------------------------------- */
20
21/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000022 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000023
24 This is done in a lazy way so that the Unicode implementation does
25 not downgrade startup time of scripts not needing it.
26
Guido van Rossumb95de4f2000-03-31 17:25:23 +000027 ImportErrors are silently ignored by this function. Only one try is
28 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029
30*/
31
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034int PyCodec_Register(PyObject *search_function)
35{
Victor Stinner81a7be32020-04-14 15:14:01 +020036 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000037 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000038 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000040 PyErr_BadArgument();
41 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000042 }
43 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000044 PyErr_SetString(PyExc_TypeError, "argument must be callable");
45 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000047 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000048
49 onError:
50 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051}
52
Hai Shid332e7b2020-09-29 05:41:11 +080053int
54PyCodec_Unregister(PyObject *search_function)
55{
56 PyInterpreterState *interp = PyInterpreterState_Get();
57 PyObject *codec_search_path = interp->codec_search_path;
58 /* Do nothing if codec_search_path is not created yet or was cleared. */
59 if (codec_search_path == NULL) {
60 return 0;
61 }
62
63 assert(PyList_CheckExact(codec_search_path));
64 Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
65 for (Py_ssize_t i = 0; i < n; i++) {
66 PyObject *item = PyList_GET_ITEM(codec_search_path, i);
67 if (item == search_function) {
68 if (interp->codec_search_cache != NULL) {
69 assert(PyDict_CheckExact(interp->codec_search_cache));
70 PyDict_Clear(interp->codec_search_cache);
71 }
72 return PyList_SetSlice(codec_search_path, i, i+1, NULL);
73 }
74 }
75 return 0;
76}
77
Jordon Xu20f59fe2019-08-21 21:26:20 +080078extern int _Py_normalize_encoding(const char *, char *, size_t);
79
/* Convert a string to a normalized Python string (decoded from UTF-8): all
   characters are converted to lower case, spaces and hyphens are replaced
   with underscores. */
Guido van Rossum9e896b32000-04-05 20:11:21 +000082
Guido van Rossumfeee4b92000-03-10 22:57:27 +000083static
Guido van Rossum9e896b32000-04-05 20:11:21 +000084PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000085{
Guido van Rossum582acec2000-06-28 22:07:35 +000086 size_t len = strlen(string);
Jordon Xu20f59fe2019-08-21 21:26:20 +080087 char *encoding;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000088 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000091 PyErr_SetString(PyExc_OverflowError, "string is too large");
92 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000093 }
Guido van Rossum21431e82007-10-19 21:48:41 +000094
Jordon Xu20f59fe2019-08-21 21:26:20 +080095 encoding = PyMem_Malloc(len + 1);
96 if (encoding == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020097 return PyErr_NoMemory();
Jordon Xu20f59fe2019-08-21 21:26:20 +080098
99 if (!_Py_normalize_encoding(string, encoding, len + 1))
100 {
101 PyErr_SetString(PyExc_RuntimeError, "_Py_normalize_encoding() failed");
102 PyMem_Free(encoding);
103 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000104 }
Jordon Xu20f59fe2019-08-21 21:26:20 +0800105
106 v = PyUnicode_FromString(encoding);
107 PyMem_Free(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108 return v;
109}
110
111/* Lookup the given encoding and return a tuple providing the codec
112 facilities.
113
114 The encoding string is looked up converted to all lower-case
115 characters. This makes encodings looked up through this mechanism
116 effectively case-insensitive.
117
Guido van Rossum98297ee2007-11-06 21:34:58 +0000118 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000119
120 As side effect, this tries to load the encodings package, if not
121 yet done. This is part of the lazy load strategy for the encodings
122 package.
123
124*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125
126PyObject *_PyCodec_Lookup(const char *encoding)
127{
Fred Drake766de832000-05-09 19:55:59 +0000128 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000129 PyErr_BadArgument();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200130 return NULL;
Fred Drake766de832000-05-09 19:55:59 +0000131 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000132
Victor Stinner81a7be32020-04-14 15:14:01 +0200133 PyInterpreterState *interp = _PyInterpreterState_GET();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200134 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
135 return NULL;
136 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000137
Guido van Rossum9e896b32000-04-05 20:11:21 +0000138 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000139 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000140 replaced with underscores. */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200141 PyObject *v = normalizestring(encoding);
142 if (v == NULL) {
143 return NULL;
144 }
Guido van Rossum21431e82007-10-19 21:48:41 +0000145 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 /* First, try to lookup the name in the registry dictionary */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200148 PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000149 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 Py_INCREF(result);
151 Py_DECREF(v);
152 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000153 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200154 else if (PyErr_Occurred()) {
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200155 goto onError;
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200156 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000157
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000158 /* Next, scan the search functions in order of registration */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200159 const Py_ssize_t len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000160 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000161 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000162 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000163 PyErr_SetString(PyExc_LookupError,
164 "no codec search functions registered: "
165 "can't find encoding");
166 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000167 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000168
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200169 Py_ssize_t i;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000172
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 func = PyList_GetItem(interp->codec_search_path, i);
174 if (func == NULL)
175 goto onError;
Petr Viktorinffd97532020-02-11 17:46:57 +0100176 result = PyObject_CallOneArg(func, v);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 if (result == NULL)
178 goto onError;
179 if (result == Py_None) {
180 Py_DECREF(result);
181 continue;
182 }
183 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
184 PyErr_SetString(PyExc_TypeError,
185 "codec search functions must return 4-tuples");
186 Py_DECREF(result);
187 goto onError;
188 }
189 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000190 }
191 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000192 /* XXX Perhaps we should cache misses too ? */
193 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000194 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000195 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000196 }
197
198 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000199 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(result);
201 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000202 }
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200203 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000204 return result;
205
206 onError:
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200207 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208 return NULL;
209}
210
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000211/* Codec registry encoding check API. */
212
213int PyCodec_KnownEncoding(const char *encoding)
214{
215 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000216
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000217 codecs = _PyCodec_Lookup(encoding);
218 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 PyErr_Clear();
220 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000221 }
222 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 Py_DECREF(codecs);
224 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000225 }
226}
227
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000228static
229PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000230 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000231{
232 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000233
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000234 args = PyTuple_New(1 + (errors != NULL));
235 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000236 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000237 Py_INCREF(object);
238 PyTuple_SET_ITEM(args,0,object);
239 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000241
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000242 v = PyUnicode_FromString(errors);
243 if (v == NULL) {
244 Py_DECREF(args);
245 return NULL;
246 }
247 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000248 }
249 return args;
250}
251
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000252/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000253
254static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000257 PyObject *codecs;
258 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000259
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 codecs = _PyCodec_Lookup(encoding);
261 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000262 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000263 v = PyTuple_GET_ITEM(codecs, index);
264 Py_DECREF(codecs);
265 Py_INCREF(v);
266 return v;
267}
268
Nick Coghlana9b15242014-02-04 22:11:18 +1000269/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000270static
Nick Coghlana9b15242014-02-04 22:11:18 +1000271PyObject *codec_makeincrementalcodec(PyObject *codec_info,
272 const char *errors,
273 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274{
Nick Coghlana9b15242014-02-04 22:11:18 +1000275 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276
Nick Coghlana9b15242014-02-04 22:11:18 +1000277 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000281 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100283 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000284 Py_DECREF(inccodec);
285 return ret;
286}
287
Nick Coghlana9b15242014-02-04 22:11:18 +1000288static
289PyObject *codec_getincrementalcodec(const char *encoding,
290 const char *errors,
291 const char *attrname)
292{
293 PyObject *codec_info, *ret;
294
295 codec_info = _PyCodec_Lookup(encoding);
296 if (codec_info == NULL)
297 return NULL;
298 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
299 Py_DECREF(codec_info);
300 return ret;
301}
302
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000303/* Helper function to create a stream codec. */
304
305static
306PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000307 PyObject *stream,
308 const char *errors,
309 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000310{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000311 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000312
313 codecs = _PyCodec_Lookup(encoding);
314 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000315 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000316
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000317 codeccls = PyTuple_GET_ITEM(codecs, index);
318 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000319 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000320 else
Petr Viktorinffd97532020-02-11 17:46:57 +0100321 streamcodec = PyObject_CallOneArg(codeccls, stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000322 Py_DECREF(codecs);
323 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000324}
325
Nick Coghlana9b15242014-02-04 22:11:18 +1000326/* Helpers to work with the result of _PyCodec_Lookup
327
328 */
329PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
330 const char *errors)
331{
332 return codec_makeincrementalcodec(codec_info, errors,
333 "incrementaldecoder");
334}
335
336PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
337 const char *errors)
338{
339 return codec_makeincrementalcodec(codec_info, errors,
340 "incrementalencoder");
341}
342
343
Guido van Rossum98297ee2007-11-06 21:34:58 +0000344/* Convenience APIs to query the Codec registry.
345
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000346 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000347
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000348 */
349
350PyObject *PyCodec_Encoder(const char *encoding)
351{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000352 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000353}
354
355PyObject *PyCodec_Decoder(const char *encoding)
356{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000357 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000358}
359
Thomas Woutersa9773292006-04-21 09:43:23 +0000360PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000361 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000362{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000363 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000364}
365
366PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000367 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000368{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000369 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000370}
371
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000372PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000373 PyObject *stream,
374 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000376 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000377}
378
379PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000380 PyObject *stream,
381 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000383 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000384}
385
Nick Coghlan8b097b42013-11-13 23:49:21 +1000386/* Helper that tries to ensure the reported exception chain indicates the
387 * codec that was invoked to trigger the failure without changing the type
388 * of the exception raised.
389 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause() swaps the active exception for a suitably
     * updated clone when it can; otherwise the original exception is left
     * untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
401
Martin Panter6245cb32016-04-15 02:14:19 +0000402/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000403 and return the resulting encoded object (usually a Python string).
404
405 errors is passed to the encoder factory as argument if non-NULL. */
406
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000407static PyObject *
408_PyCodec_EncodeInternal(PyObject *object,
409 PyObject *encoder,
410 const char *encoding,
411 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000412{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000413 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000414 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000416 args = args_tuple(object, errors);
417 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000418 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000419
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200420 result = PyObject_Call(encoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000421 if (result == NULL) {
422 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000423 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000424 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000425
Guido van Rossum98297ee2007-11-06 21:34:58 +0000426 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 PyTuple_GET_SIZE(result) != 2) {
428 PyErr_SetString(PyExc_TypeError,
429 "encoder must return a tuple (object, integer)");
430 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000431 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000432 v = PyTuple_GET_ITEM(result,0);
433 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000434 /* We don't check or use the second (integer) entry. */
435
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000436 Py_DECREF(args);
437 Py_DECREF(encoder);
438 Py_DECREF(result);
439 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000440
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000441 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000442 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000443 Py_XDECREF(args);
444 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000445 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000446}
447
448/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000449 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000450
451 errors is passed to the decoder factory as argument if non-NULL. */
452
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000453static PyObject *
454_PyCodec_DecodeInternal(PyObject *object,
455 PyObject *decoder,
456 const char *encoding,
457 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000458{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000459 PyObject *args = NULL, *result = NULL;
460 PyObject *v;
461
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000462 args = args_tuple(object, errors);
463 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000464 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000465
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200466 result = PyObject_Call(decoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000467 if (result == NULL) {
468 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000469 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000470 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000471 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 PyTuple_GET_SIZE(result) != 2) {
473 PyErr_SetString(PyExc_TypeError,
474 "decoder must return a tuple (object,integer)");
475 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000476 }
477 v = PyTuple_GET_ITEM(result,0);
478 Py_INCREF(v);
479 /* We don't check or use the second (integer) entry. */
480
481 Py_DECREF(args);
482 Py_DECREF(decoder);
483 Py_DECREF(result);
484 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000485
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000486 onError:
487 Py_XDECREF(args);
488 Py_XDECREF(decoder);
489 Py_XDECREF(result);
490 return NULL;
491}
492
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000493/* Generic encoding/decoding API */
494PyObject *PyCodec_Encode(PyObject *object,
495 const char *encoding,
496 const char *errors)
497{
498 PyObject *encoder;
499
500 encoder = PyCodec_Encoder(encoding);
501 if (encoder == NULL)
502 return NULL;
503
504 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
505}
506
507PyObject *PyCodec_Decode(PyObject *object,
508 const char *encoding,
509 const char *errors)
510{
511 PyObject *decoder;
512
513 decoder = PyCodec_Decoder(encoding);
514 if (decoder == NULL)
515 return NULL;
516
517 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
518}
519
520/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000521PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
522 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000523{
524 _Py_IDENTIFIER(_is_text_encoding);
525 PyObject *codec;
526 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000527 int is_text_codec;
528
529 codec = _PyCodec_Lookup(encoding);
530 if (codec == NULL)
531 return NULL;
532
533 /* Backwards compatibility: assume any raw tuple describes a text
534 * encoding, and the same for anything lacking the private
535 * attribute.
536 */
537 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200538 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
539 Py_DECREF(codec);
540 return NULL;
541 }
542 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000543 is_text_codec = PyObject_IsTrue(attr);
544 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300545 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000546 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300547 if (!is_text_codec)
548 PyErr_Format(PyExc_LookupError,
549 "'%.400s' is not a text encoding; "
550 "use %s to handle arbitrary codecs",
551 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000552 return NULL;
553 }
554 }
555 }
556
Nick Coghlana9b15242014-02-04 22:11:18 +1000557 /* This appears to be a valid text encoding */
558 return codec;
559}
560
561
562static
563PyObject *codec_getitem_checked(const char *encoding,
564 const char *alternate_command,
565 int index)
566{
567 PyObject *codec;
568 PyObject *v;
569
570 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
571 if (codec == NULL)
572 return NULL;
573
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000574 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000575 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000576 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000577 return v;
578}
579
580static PyObject * _PyCodec_TextEncoder(const char *encoding)
581{
Nick Coghlana9b15242014-02-04 22:11:18 +1000582 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000583}
584
585static PyObject * _PyCodec_TextDecoder(const char *encoding)
586{
Nick Coghlana9b15242014-02-04 22:11:18 +1000587 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000588}
589
590PyObject *_PyCodec_EncodeText(PyObject *object,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *encoder;
595
596 encoder = _PyCodec_TextEncoder(encoding);
597 if (encoder == NULL)
598 return NULL;
599
600 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
601}
602
603PyObject *_PyCodec_DecodeText(PyObject *object,
604 const char *encoding,
605 const char *errors)
606{
607 PyObject *decoder;
608
609 decoder = _PyCodec_TextDecoder(encoding);
610 if (decoder == NULL)
611 return NULL;
612
613 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
614}
615
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
   callback name, when name is specified as the error parameter
   in the call to the encode/decode function.
   Return 0 on success, -1 on error */
622int PyCodec_RegisterError(const char *name, PyObject *error)
623{
Victor Stinner81a7be32020-04-14 15:14:01 +0200624 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000625 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000627 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000628 PyErr_SetString(PyExc_TypeError, "handler must be callable");
629 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000630 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000631 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300632 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000633}
634
635/* Lookup the error handling callback function registered under the
636 name error. As a special case NULL can be passed, in which case
637 the error handling callback for strict encoding will be returned. */
638PyObject *PyCodec_LookupError(const char *name)
639{
640 PyObject *handler = NULL;
641
Victor Stinner81a7be32020-04-14 15:14:01 +0200642 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000643 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000644 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000645
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000646 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 name = "strict";
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200648 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
649 if (handler) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 Py_INCREF(handler);
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200651 }
652 else if (!PyErr_Occurred()) {
653 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
654 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000655 return handler;
656}
657
658static void wrong_exception_type(PyObject *exc)
659{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300660 PyErr_Format(PyExc_TypeError,
661 "don't know how to handle %.200s in error callback",
Victor Stinnera102ed72020-02-07 02:24:48 +0100662 Py_TYPE(exc)->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000663}
664
665PyObject *PyCodec_StrictErrors(PyObject *exc)
666{
Brett Cannonbf364092006-03-01 04:25:17 +0000667 if (PyExceptionInstance_Check(exc))
668 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000669 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000671 return NULL;
672}
673
674
675PyObject *PyCodec_IgnoreErrors(PyObject *exc)
676{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000677 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300678
679 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000680 if (PyUnicodeEncodeError_GetEnd(exc, &end))
681 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000682 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300683 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000684 if (PyUnicodeDecodeError_GetEnd(exc, &end))
685 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000686 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300687 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000688 if (PyUnicodeTranslateError_GetEnd(exc, &end))
689 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000690 }
691 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000692 wrong_exception_type(exc);
693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000694 }
Victor Stinneree450092011-12-01 02:52:11 +0100695 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000696}
697
698
699PyObject *PyCodec_ReplaceErrors(PyObject *exc)
700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000702
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300703 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300705 Py_UCS1 *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 if (PyUnicodeEncodeError_GetStart(exc, &start))
707 return NULL;
708 if (PyUnicodeEncodeError_GetEnd(exc, &end))
709 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710 len = end - start;
711 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 if (res == NULL)
713 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300714 assert(PyUnicode_KIND(res) == PyUnicode_1BYTE_KIND);
715 outp = PyUnicode_1BYTE_DATA(res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716 for (i = 0; i < len; ++i)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300717 outp[i] = '?';
Victor Stinner8f825062012-04-27 13:55:39 +0200718 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000720 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300721 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000722 if (PyUnicodeDecodeError_GetEnd(exc, &end))
723 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200724 return Py_BuildValue("(Cn)",
725 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
726 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000727 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300728 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300730 Py_UCS2 *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 if (PyUnicodeTranslateError_GetStart(exc, &start))
732 return NULL;
733 if (PyUnicodeTranslateError_GetEnd(exc, &end))
734 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735 len = end - start;
736 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 if (res == NULL)
738 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300739 assert(PyUnicode_KIND(res) == PyUnicode_2BYTE_KIND);
740 outp = PyUnicode_2BYTE_DATA(res);
741 for (i = 0; i < len; i++)
742 outp[i] = Py_UNICODE_REPLACEMENT_CHARACTER;
Victor Stinner8f825062012-04-27 13:55:39 +0200743 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745 }
746 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 wrong_exception_type(exc);
748 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749 }
750}
751
752PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300754 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 PyObject *restuple;
756 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100757 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758 Py_ssize_t start;
759 Py_ssize_t end;
760 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300761 Py_UCS1 *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300762 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100763 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 if (PyUnicodeEncodeError_GetStart(exc, &start))
765 return NULL;
766 if (PyUnicodeEncodeError_GetEnd(exc, &end))
767 return NULL;
768 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
769 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300770 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
771 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100772 for (i = start, ressize = 0; i < end; ++i) {
773 /* object is guaranteed to be "ready" */
774 ch = PyUnicode_READ_CHAR(object, i);
775 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100777 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100779 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100781 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100783 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100785 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 ressize += 2+6+1;
787 else
788 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 }
790 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100791 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 if (res == NULL) {
793 Py_DECREF(object);
794 return NULL;
795 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100796 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100798 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 int digits;
800 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100801 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 *outp++ = '&';
803 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100804 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 digits = 1;
806 base = 1;
807 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100808 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 digits = 2;
810 base = 10;
811 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100812 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 digits = 3;
814 base = 100;
815 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100816 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 digits = 4;
818 base = 1000;
819 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100820 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 digits = 5;
822 base = 10000;
823 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100824 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000825 digits = 6;
826 base = 100000;
827 }
828 else {
829 digits = 7;
830 base = 1000000;
831 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100833 *outp++ = '0' + ch/base;
834 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 base /= 10;
836 }
837 *outp++ = ';';
838 }
Victor Stinner8f825062012-04-27 13:55:39 +0200839 assert(_PyUnicode_CheckConsistency(res, 1));
840 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841 Py_DECREF(object);
842 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000843 }
844 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000845 wrong_exception_type(exc);
846 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 }
848}
849
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000850PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
851{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200852 PyObject *object;
853 Py_ssize_t i;
854 Py_ssize_t start;
855 Py_ssize_t end;
856 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300857 Py_UCS1 *outp;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200858 int ressize;
859 Py_UCS4 c;
860
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300861 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300862 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200863 if (PyUnicodeDecodeError_GetStart(exc, &start))
864 return NULL;
865 if (PyUnicodeDecodeError_GetEnd(exc, &end))
866 return NULL;
867 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
868 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300869 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200870 res = PyUnicode_New(4 * (end - start), 127);
871 if (res == NULL) {
872 Py_DECREF(object);
873 return NULL;
874 }
875 outp = PyUnicode_1BYTE_DATA(res);
876 for (i = start; i < end; i++, outp += 4) {
877 unsigned char c = p[i];
878 outp[0] = '\\';
879 outp[1] = 'x';
880 outp[2] = Py_hexdigits[(c>>4)&0xf];
881 outp[3] = Py_hexdigits[c&0xf];
882 }
883
884 assert(_PyUnicode_CheckConsistency(res, 1));
885 Py_DECREF(object);
886 return Py_BuildValue("(Nn)", res, end);
887 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300888 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 if (PyUnicodeEncodeError_GetStart(exc, &start))
890 return NULL;
891 if (PyUnicodeEncodeError_GetEnd(exc, &end))
892 return NULL;
893 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
894 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200895 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300896 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200897 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200899 if (PyUnicodeTranslateError_GetEnd(exc, &end))
900 return NULL;
901 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000903 }
904 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000905 wrong_exception_type(exc);
906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000907 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200908
909 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
910 end = start + PY_SSIZE_T_MAX / (1+1+8);
911 for (i = start, ressize = 0; i < end; ++i) {
912 /* object is guaranteed to be "ready" */
913 c = PyUnicode_READ_CHAR(object, i);
914 if (c >= 0x10000) {
915 ressize += 1+1+8;
916 }
917 else if (c >= 0x100) {
918 ressize += 1+1+4;
919 }
920 else
921 ressize += 1+1+2;
922 }
923 res = PyUnicode_New(ressize, 127);
924 if (res == NULL) {
925 Py_DECREF(object);
926 return NULL;
927 }
928 outp = PyUnicode_1BYTE_DATA(res);
929 for (i = start; i < end; ++i) {
930 c = PyUnicode_READ_CHAR(object, i);
931 *outp++ = '\\';
932 if (c >= 0x00010000) {
933 *outp++ = 'U';
934 *outp++ = Py_hexdigits[(c>>28)&0xf];
935 *outp++ = Py_hexdigits[(c>>24)&0xf];
936 *outp++ = Py_hexdigits[(c>>20)&0xf];
937 *outp++ = Py_hexdigits[(c>>16)&0xf];
938 *outp++ = Py_hexdigits[(c>>12)&0xf];
939 *outp++ = Py_hexdigits[(c>>8)&0xf];
940 }
941 else if (c >= 0x100) {
942 *outp++ = 'u';
943 *outp++ = Py_hexdigits[(c>>12)&0xf];
944 *outp++ = Py_hexdigits[(c>>8)&0xf];
945 }
946 else
947 *outp++ = 'x';
948 *outp++ = Py_hexdigits[(c>>4)&0xf];
949 *outp++ = Py_hexdigits[c&0xf];
950 }
951
952 assert(_PyUnicode_CheckConsistency(res, 1));
953 Py_DECREF(object);
954 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955}
956
Victor Stinner47e1afd2020-10-26 16:43:47 +0100957static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200958
959PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
960{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300961 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200962 PyObject *restuple;
963 PyObject *object;
964 Py_ssize_t i;
965 Py_ssize_t start;
966 Py_ssize_t end;
967 PyObject *res;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300968 Py_UCS1 *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200969 Py_ssize_t ressize;
970 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200971 Py_UCS4 c;
972 char buffer[256]; /* NAME_MAXLEN */
973 if (PyUnicodeEncodeError_GetStart(exc, &start))
974 return NULL;
975 if (PyUnicodeEncodeError_GetEnd(exc, &end))
976 return NULL;
977 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
978 return NULL;
Victor Stinner47e1afd2020-10-26 16:43:47 +0100979 if (!ucnhash_capi) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200980 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +0100981 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200982 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +0100983 if (!ucnhash_capi) {
Victor Stinner38b8ae02015-09-03 16:19:40 +0200984 return NULL;
Victor Stinner47e1afd2020-10-26 16:43:47 +0100985 }
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200986 }
987 for (i = start, ressize = 0; i < end; ++i) {
988 /* object is guaranteed to be "ready" */
989 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner920cb642020-10-26 19:19:36 +0100990 if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200991 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200992 }
993 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200994 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200995 }
996 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200997 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200998 }
999 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001000 replsize = 1+1+2;
1001 if (ressize > PY_SSIZE_T_MAX - replsize)
1002 break;
1003 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001004 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001005 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001006 res = PyUnicode_New(ressize, 127);
1007 if (res==NULL)
1008 return NULL;
1009 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1010 i < end; ++i) {
1011 c = PyUnicode_READ_CHAR(object, i);
1012 *outp++ = '\\';
Victor Stinner920cb642020-10-26 19:19:36 +01001013 if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001014 *outp++ = 'N';
1015 *outp++ = '{';
1016 strcpy((char *)outp, buffer);
1017 outp += strlen(buffer);
1018 *outp++ = '}';
1019 continue;
1020 }
1021 if (c >= 0x00010000) {
1022 *outp++ = 'U';
1023 *outp++ = Py_hexdigits[(c>>28)&0xf];
1024 *outp++ = Py_hexdigits[(c>>24)&0xf];
1025 *outp++ = Py_hexdigits[(c>>20)&0xf];
1026 *outp++ = Py_hexdigits[(c>>16)&0xf];
1027 *outp++ = Py_hexdigits[(c>>12)&0xf];
1028 *outp++ = Py_hexdigits[(c>>8)&0xf];
1029 }
1030 else if (c >= 0x100) {
1031 *outp++ = 'u';
1032 *outp++ = Py_hexdigits[(c>>12)&0xf];
1033 *outp++ = Py_hexdigits[(c>>8)&0xf];
1034 }
1035 else
1036 *outp++ = 'x';
1037 *outp++ = Py_hexdigits[(c>>4)&0xf];
1038 *outp++ = Py_hexdigits[c&0xf];
1039 }
1040
Benjamin Peterson3663b582014-11-26 14:39:54 -06001041 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001042 assert(_PyUnicode_CheckConsistency(res, 1));
1043 restuple = Py_BuildValue("(Nn)", res, end);
1044 Py_DECREF(object);
1045 return restuple;
1046 }
1047 else {
1048 wrong_exception_type(exc);
1049 return NULL;
1050 }
1051}
1052
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name onto one of the ENC_* constants.

   Recognised spellings (the "utf" prefix and the "be"/"le" suffix are
   matched case-insensitively; '-' and '_' are optional separators):
   utf8, utf16, utf16le, utf16be, utf32, utf32le, utf32be, plus the exact
   name "CP_UTF8".  A name without a byte-order suffix selects the native
   byte order.

   On success *bytelength receives the number of bytes used for one lone
   surrogate in that encoding (3, 2 or 4).  When ENC_UNKNOWN is returned
   the value of *bytelength must not be relied upon: it may or may not
   have been written. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: after skipping a separator the
               string may already be finished (e.g. "utf-16-"), in which
               case encoding[1] lies beyond the terminating NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: never read past the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001119/* This handler is declared static until someone demonstrates
1120 a need to call it directly. */
1121static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001122PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001123{
1124 PyObject *restuple;
1125 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001126 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001127 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001128 int code;
1129 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001130 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001131 Py_ssize_t start;
1132 Py_ssize_t end;
1133 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001134
1135 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001136 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001137 if (PyUnicodeEncodeError_GetStart(exc, &start))
1138 return NULL;
1139 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140 return NULL;
1141 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001143 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144 Py_DECREF(object);
1145 return NULL;
1146 }
1147 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148 Py_DECREF(object);
1149 Py_DECREF(encode);
1150 return NULL;
1151 }
1152 code = get_standard_encoding(encoding, &bytelength);
1153 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001154 if (code == ENC_UNKNOWN) {
1155 /* Not supported, fail with original exception */
1156 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157 Py_DECREF(object);
1158 return NULL;
1159 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001160
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001161 if (end - start > PY_SSIZE_T_MAX / bytelength)
1162 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001163 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 if (!res) {
1165 Py_DECREF(object);
1166 return NULL;
1167 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001168 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001169 for (i = start; i < end; i++) {
1170 /* object is guaranteed to be "ready" */
1171 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001172 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 /* Not a surrogate, fail with original exception */
1174 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175 Py_DECREF(res);
1176 Py_DECREF(object);
1177 return NULL;
1178 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001179 switch (code) {
1180 case ENC_UTF8:
1181 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184 break;
1185 case ENC_UTF16LE:
1186 *outp++ = (unsigned char) ch;
1187 *outp++ = (unsigned char)(ch >> 8);
1188 break;
1189 case ENC_UTF16BE:
1190 *outp++ = (unsigned char)(ch >> 8);
1191 *outp++ = (unsigned char) ch;
1192 break;
1193 case ENC_UTF32LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 *outp++ = (unsigned char)(ch >> 16);
1197 *outp++ = (unsigned char)(ch >> 24);
1198 break;
1199 case ENC_UTF32BE:
1200 *outp++ = (unsigned char)(ch >> 24);
1201 *outp++ = (unsigned char)(ch >> 16);
1202 *outp++ = (unsigned char)(ch >> 8);
1203 *outp++ = (unsigned char) ch;
1204 break;
1205 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 }
1207 restuple = Py_BuildValue("(On)", res, end);
1208 Py_DECREF(res);
1209 Py_DECREF(object);
1210 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001211 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001212 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001213 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001214 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 if (PyUnicodeDecodeError_GetStart(exc, &start))
1216 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001217 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001221 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001222 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223 Py_DECREF(object);
1224 return NULL;
1225 }
1226 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227 Py_DECREF(object);
1228 Py_DECREF(encode);
1229 return NULL;
1230 }
1231 code = get_standard_encoding(encoding, &bytelength);
1232 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001233 if (code == ENC_UNKNOWN) {
1234 /* Not supported, fail with original exception */
1235 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236 Py_DECREF(object);
1237 return NULL;
1238 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001239
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 /* Try decoding a single surrogate character. If
1241 there are more, let the codec call us again. */
1242 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001243 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244 switch (code) {
1245 case ENC_UTF8:
1246 if ((p[0] & 0xf0) == 0xe0 &&
1247 (p[1] & 0xc0) == 0x80 &&
1248 (p[2] & 0xc0) == 0x80) {
1249 /* it's a three-byte code */
1250 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251 }
1252 break;
1253 case ENC_UTF16LE:
1254 ch = p[1] << 8 | p[0];
1255 break;
1256 case ENC_UTF16BE:
1257 ch = p[0] << 8 | p[1];
1258 break;
1259 case ENC_UTF32LE:
1260 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261 break;
1262 case ENC_UTF32BE:
1263 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264 break;
1265 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001267
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001269 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272 return NULL;
1273 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001274 res = PyUnicode_FromOrdinal(ch);
1275 if (res == NULL)
1276 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001277 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001278 }
1279 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 wrong_exception_type(exc);
1281 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001282 }
1283}
1284
Martin v. Löwis011e8422009-05-05 04:43:17 +00001285static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001286PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001287{
1288 PyObject *restuple;
1289 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001290 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001291 Py_ssize_t start;
1292 Py_ssize_t end;
1293 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001294
1295 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 char *outp;
1297 if (PyUnicodeEncodeError_GetStart(exc, &start))
1298 return NULL;
1299 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300 return NULL;
1301 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 res = PyBytes_FromStringAndSize(NULL, end-start);
1304 if (!res) {
1305 Py_DECREF(object);
1306 return NULL;
1307 }
1308 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001309 for (i = start; i < end; i++) {
1310 /* object is guaranteed to be "ready" */
1311 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 if (ch < 0xdc80 || ch > 0xdcff) {
1313 /* Not a UTF-8b surrogate, fail with original exception */
1314 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315 Py_DECREF(res);
1316 Py_DECREF(object);
1317 return NULL;
1318 }
1319 *outp++ = ch - 0xdc00;
1320 }
1321 restuple = Py_BuildValue("(On)", res, end);
1322 Py_DECREF(res);
1323 Py_DECREF(object);
1324 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001325 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001326 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001327 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001328 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001329 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 int consumed = 0;
1331 if (PyUnicodeDecodeError_GetStart(exc, &start))
1332 return NULL;
1333 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334 return NULL;
1335 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001337 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 while (consumed < 4 && consumed < end-start) {
1339 /* Refuse to escape ASCII bytes. */
1340 if (p[start+consumed] < 128)
1341 break;
1342 ch[consumed] = 0xdc00 + p[start+consumed];
1343 consumed++;
1344 }
1345 Py_DECREF(object);
1346 if (!consumed) {
1347 /* codec complained about ASCII byte. */
1348 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349 return NULL;
1350 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001351 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352 if (str == NULL)
1353 return NULL;
1354 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001355 }
1356 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 wrong_exception_type(exc);
1358 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001359 }
1360}
1361
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363static PyObject *strict_errors(PyObject *self, PyObject *exc)
1364{
1365 return PyCodec_StrictErrors(exc);
1366}
1367
1368
1369static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1370{
1371 return PyCodec_IgnoreErrors(exc);
1372}
1373
1374
1375static PyObject *replace_errors(PyObject *self, PyObject *exc)
1376{
1377 return PyCodec_ReplaceErrors(exc);
1378}
1379
1380
1381static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1382{
1383 return PyCodec_XMLCharRefReplaceErrors(exc);
1384}
1385
1386
1387static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1388{
1389 return PyCodec_BackslashReplaceErrors(exc);
1390}
1391
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001392static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1393{
1394 return PyCodec_NameReplaceErrors(exc);
1395}
1396
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001397static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001398{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001399 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001400}
1401
Martin v. Löwis43c57782009-05-10 08:15:24 +00001402static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001403{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001404 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001405}
1406
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001407static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 static struct {
Andy Lester7386a702020-02-13 22:42:56 -06001410 const char *name;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001411 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 } methods[] =
1413 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 {
1415 "strict",
1416 {
1417 "strict_errors",
1418 strict_errors,
1419 METH_O,
1420 PyDoc_STR("Implements the 'strict' error handling, which "
1421 "raises a UnicodeError on coding errors.")
1422 }
1423 },
1424 {
1425 "ignore",
1426 {
1427 "ignore_errors",
1428 ignore_errors,
1429 METH_O,
1430 PyDoc_STR("Implements the 'ignore' error handling, which "
1431 "ignores malformed data and continues.")
1432 }
1433 },
1434 {
1435 "replace",
1436 {
1437 "replace_errors",
1438 replace_errors,
1439 METH_O,
1440 PyDoc_STR("Implements the 'replace' error handling, which "
1441 "replaces malformed data with a replacement marker.")
1442 }
1443 },
1444 {
1445 "xmlcharrefreplace",
1446 {
1447 "xmlcharrefreplace_errors",
1448 xmlcharrefreplace_errors,
1449 METH_O,
1450 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451 "which replaces an unencodable character with the "
1452 "appropriate XML character reference.")
1453 }
1454 },
1455 {
1456 "backslashreplace",
1457 {
1458 "backslashreplace_errors",
1459 backslashreplace_errors,
1460 METH_O,
1461 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001462 "which replaces malformed data with a backslashed "
1463 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 }
1465 },
1466 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001467 "namereplace",
1468 {
1469 "namereplace_errors",
1470 namereplace_errors,
1471 METH_O,
1472 PyDoc_STR("Implements the 'namereplace' error handling, "
1473 "which replaces an unencodable character with a "
1474 "\\N{...} escape sequence.")
1475 }
1476 },
1477 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001478 "surrogatepass",
1479 {
1480 "surrogatepass",
1481 surrogatepass_errors,
1482 METH_O
1483 }
1484 },
1485 {
1486 "surrogateescape",
1487 {
1488 "surrogateescape",
1489 surrogateescape_errors,
1490 METH_O
1491 }
1492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494
Victor Stinner81a7be32020-04-14 15:14:01 +02001495 PyInterpreterState *interp = _PyInterpreterState_GET();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001496 PyObject *mod;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001497
1498 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001500
1501 interp->codec_search_path = PyList_New(0);
Victor Stinnerd3a1de22020-01-27 23:23:12 +01001502 if (interp->codec_search_path == NULL) {
1503 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001504 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001505
Victor Stinnerd3a1de22020-01-27 23:23:12 +01001506 interp->codec_search_cache = PyDict_New();
1507 if (interp->codec_search_cache == NULL) {
1508 return -1;
1509 }
1510
1511 interp->codec_error_registry = PyDict_New();
1512 if (interp->codec_error_registry == NULL) {
1513 return -1;
1514 }
1515
1516 for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1517 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1518 if (!func) {
1519 return -1;
1520 }
1521
1522 int res = PyCodec_RegisterError(methods[i].name, func);
1523 Py_DECREF(func);
1524 if (res) {
1525 return -1;
1526 }
1527 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001528
Christian Heimes819b8bf2008-01-03 23:05:47 +00001529 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001530 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001531 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001532 }
1533 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001534 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001535 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001536}