blob: 4f38b33e0b76f16ffe057a23c760eac773201e03 [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010012#include "pycore_pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
/* Lower-case hexadecimal digits; the character at index n (0..15) is the
   digit for the value n. */
const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
/* Import the standard encodings package which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   slow down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Victor Stinnercaba55b2018-08-03 15:33:52 +020035 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Guido van Rossum9e896b32000-04-05 20:11:21 +000052/* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
Guido van Rossum21431e82007-10-19 21:48:41 +000081 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Fred Drake766de832000-05-09 19:55:59 +0000102 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000103 PyErr_BadArgument();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200104 return NULL;
Fred Drake766de832000-05-09 19:55:59 +0000105 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106
Victor Stinnercaba55b2018-08-03 15:33:52 +0200107 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200108 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
109 return NULL;
110 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000111
Guido van Rossum9e896b32000-04-05 20:11:21 +0000112 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000113 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000114 replaced with underscores. */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200115 PyObject *v = normalizestring(encoding);
116 if (v == NULL) {
117 return NULL;
118 }
Guido van Rossum21431e82007-10-19 21:48:41 +0000119 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120
121 /* First, try to lookup the name in the registry dictionary */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200122 PyObject *result = PyDict_GetItemWithError(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000124 Py_INCREF(result);
125 Py_DECREF(v);
126 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200128 else if (PyErr_Occurred()) {
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200129 goto onError;
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200130 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000131
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 /* Next, scan the search functions in order of registration */
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200133 const Py_ssize_t len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000134 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000136 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 PyErr_SetString(PyExc_LookupError,
138 "no codec search functions registered: "
139 "can't find encoding");
140 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000141 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000142
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200143 Py_ssize_t i;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000145 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000146
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 func = PyList_GetItem(interp->codec_search_path, i);
148 if (func == NULL)
149 goto onError;
Jeroen Demeyer196a5302019-07-04 12:31:34 +0200150 result = _PyObject_CallOneArg(func, v);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 if (result == NULL)
152 goto onError;
153 if (result == Py_None) {
154 Py_DECREF(result);
155 continue;
156 }
157 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
158 PyErr_SetString(PyExc_TypeError,
159 "codec search functions must return 4-tuples");
160 Py_DECREF(result);
161 goto onError;
162 }
163 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000164 }
165 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000166 /* XXX Perhaps we should cache misses too ? */
167 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000168 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000170 }
171
172 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000173 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000174 Py_DECREF(result);
175 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 }
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200177 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 return result;
179
180 onError:
Jeroen Demeyer6e43d072019-07-05 12:57:32 +0200181 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000182 return NULL;
183}
184
Nick Coghlan8fad1672014-09-15 23:50:44 +1200185int _PyCodec_Forget(const char *encoding)
186{
Nick Coghlan8fad1672014-09-15 23:50:44 +1200187 PyObject *v;
188 int result;
189
Victor Stinnercaba55b2018-08-03 15:33:52 +0200190 PyInterpreterState *interp = _PyInterpreterState_Get();
Nick Coghlan8fad1672014-09-15 23:50:44 +1200191 if (interp->codec_search_path == NULL) {
192 return -1;
193 }
194
195 /* Convert the encoding to a normalized Python string: all
196 characters are converted to lower case, spaces and hyphens are
197 replaced with underscores. */
198 v = normalizestring(encoding);
199 if (v == NULL) {
200 return -1;
201 }
202
203 /* Drop the named codec from the internal cache */
204 result = PyDict_DelItem(interp->codec_search_cache, v);
205 Py_DECREF(v);
206
207 return result;
208}
209
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000210/* Codec registry encoding check API. */
211
212int PyCodec_KnownEncoding(const char *encoding)
213{
214 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000215
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000216 codecs = _PyCodec_Lookup(encoding);
217 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000218 PyErr_Clear();
219 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000220 }
221 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 Py_DECREF(codecs);
223 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000224 }
225}
226
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000227static
228PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000229 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230{
231 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000232
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233 args = PyTuple_New(1 + (errors != NULL));
234 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000235 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236 Py_INCREF(object);
237 PyTuple_SET_ITEM(args,0,object);
238 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000240
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 v = PyUnicode_FromString(errors);
242 if (v == NULL) {
243 Py_DECREF(args);
244 return NULL;
245 }
246 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000247 }
248 return args;
249}
250
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000252
253static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000254PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000255{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256 PyObject *codecs;
257 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000258
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259 codecs = _PyCodec_Lookup(encoding);
260 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000261 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000262 v = PyTuple_GET_ITEM(codecs, index);
263 Py_DECREF(codecs);
264 Py_INCREF(v);
265 return v;
266}
267
Nick Coghlana9b15242014-02-04 22:11:18 +1000268/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000269static
Nick Coghlana9b15242014-02-04 22:11:18 +1000270PyObject *codec_makeincrementalcodec(PyObject *codec_info,
271 const char *errors,
272 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000273{
Nick Coghlana9b15242014-02-04 22:11:18 +1000274 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275
Nick Coghlana9b15242014-02-04 22:11:18 +1000276 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000278 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000280 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100282 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 Py_DECREF(inccodec);
284 return ret;
285}
286
Nick Coghlana9b15242014-02-04 22:11:18 +1000287static
288PyObject *codec_getincrementalcodec(const char *encoding,
289 const char *errors,
290 const char *attrname)
291{
292 PyObject *codec_info, *ret;
293
294 codec_info = _PyCodec_Lookup(encoding);
295 if (codec_info == NULL)
296 return NULL;
297 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
298 Py_DECREF(codec_info);
299 return ret;
300}
301
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000302/* Helper function to create a stream codec. */
303
304static
305PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000306 PyObject *stream,
307 const char *errors,
308 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000309{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000310 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000311
312 codecs = _PyCodec_Lookup(encoding);
313 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000314 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000316 codeccls = PyTuple_GET_ITEM(codecs, index);
317 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000319 else
Jeroen Demeyer196a5302019-07-04 12:31:34 +0200320 streamcodec = _PyObject_CallOneArg(codeccls, stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000321 Py_DECREF(codecs);
322 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000323}
324
Nick Coghlana9b15242014-02-04 22:11:18 +1000325/* Helpers to work with the result of _PyCodec_Lookup
326
327 */
328PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
329 const char *errors)
330{
331 return codec_makeincrementalcodec(codec_info, errors,
332 "incrementaldecoder");
333}
334
335PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
336 const char *errors)
337{
338 return codec_makeincrementalcodec(codec_info, errors,
339 "incrementalencoder");
340}
341
342
Guido van Rossum98297ee2007-11-06 21:34:58 +0000343/* Convenience APIs to query the Codec registry.
344
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000345 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000346
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000347 */
348
349PyObject *PyCodec_Encoder(const char *encoding)
350{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000351 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352}
353
354PyObject *PyCodec_Decoder(const char *encoding)
355{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000357}
358
Thomas Woutersa9773292006-04-21 09:43:23 +0000359PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000360 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000361{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000362 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000363}
364
365PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000366 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000367{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000368 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000369}
370
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000371PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000372 PyObject *stream,
373 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000374{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000375 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376}
377
378PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000379 PyObject *stream,
380 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000381{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000382 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383}
384
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of
 * the exception raised.
 *
 * `operation` and `encoding` are only used to build the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* TrySetFromCause swaps the active exception for a suitably updated
     * clone when it can; otherwise the original exception is left
     * untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
400
Martin Panter6245cb32016-04-15 02:14:19 +0000401/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000402 and return the resulting encoded object (usually a Python string).
403
404 errors is passed to the encoder factory as argument if non-NULL. */
405
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000406static PyObject *
407_PyCodec_EncodeInternal(PyObject *object,
408 PyObject *encoder,
409 const char *encoding,
410 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000411{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000412 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000413 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000414
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415 args = args_tuple(object, errors);
416 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000418
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200419 result = PyObject_Call(encoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000420 if (result == NULL) {
421 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000423 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000424
Guido van Rossum98297ee2007-11-06 21:34:58 +0000425 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 PyTuple_GET_SIZE(result) != 2) {
427 PyErr_SetString(PyExc_TypeError,
428 "encoder must return a tuple (object, integer)");
429 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000430 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000431 v = PyTuple_GET_ITEM(result,0);
432 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000433 /* We don't check or use the second (integer) entry. */
434
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000435 Py_DECREF(args);
436 Py_DECREF(encoder);
437 Py_DECREF(result);
438 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000439
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000440 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000441 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000442 Py_XDECREF(args);
443 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000444 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000445}
446
447/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000448 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000449
450 errors is passed to the decoder factory as argument if non-NULL. */
451
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000452static PyObject *
453_PyCodec_DecodeInternal(PyObject *object,
454 PyObject *decoder,
455 const char *encoding,
456 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000457{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000458 PyObject *args = NULL, *result = NULL;
459 PyObject *v;
460
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000461 args = args_tuple(object, errors);
462 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000463 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000464
Jeroen Demeyer1dbd0842019-07-11 17:57:32 +0200465 result = PyObject_Call(decoder, args, NULL);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000466 if (result == NULL) {
467 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000469 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000470 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000471 PyTuple_GET_SIZE(result) != 2) {
472 PyErr_SetString(PyExc_TypeError,
473 "decoder must return a tuple (object,integer)");
474 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000475 }
476 v = PyTuple_GET_ITEM(result,0);
477 Py_INCREF(v);
478 /* We don't check or use the second (integer) entry. */
479
480 Py_DECREF(args);
481 Py_DECREF(decoder);
482 Py_DECREF(result);
483 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000484
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000485 onError:
486 Py_XDECREF(args);
487 Py_XDECREF(decoder);
488 Py_XDECREF(result);
489 return NULL;
490}
491
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000492/* Generic encoding/decoding API */
493PyObject *PyCodec_Encode(PyObject *object,
494 const char *encoding,
495 const char *errors)
496{
497 PyObject *encoder;
498
499 encoder = PyCodec_Encoder(encoding);
500 if (encoder == NULL)
501 return NULL;
502
503 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
504}
505
506PyObject *PyCodec_Decode(PyObject *object,
507 const char *encoding,
508 const char *errors)
509{
510 PyObject *decoder;
511
512 decoder = PyCodec_Decoder(encoding);
513 if (decoder == NULL)
514 return NULL;
515
516 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
517}
518
519/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000520PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
521 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000522{
523 _Py_IDENTIFIER(_is_text_encoding);
524 PyObject *codec;
525 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000526 int is_text_codec;
527
528 codec = _PyCodec_Lookup(encoding);
529 if (codec == NULL)
530 return NULL;
531
532 /* Backwards compatibility: assume any raw tuple describes a text
533 * encoding, and the same for anything lacking the private
534 * attribute.
535 */
536 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200537 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
538 Py_DECREF(codec);
539 return NULL;
540 }
541 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000542 is_text_codec = PyObject_IsTrue(attr);
543 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300544 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000545 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300546 if (!is_text_codec)
547 PyErr_Format(PyExc_LookupError,
548 "'%.400s' is not a text encoding; "
549 "use %s to handle arbitrary codecs",
550 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000551 return NULL;
552 }
553 }
554 }
555
Nick Coghlana9b15242014-02-04 22:11:18 +1000556 /* This appears to be a valid text encoding */
557 return codec;
558}
559
560
561static
562PyObject *codec_getitem_checked(const char *encoding,
563 const char *alternate_command,
564 int index)
565{
566 PyObject *codec;
567 PyObject *v;
568
569 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
570 if (codec == NULL)
571 return NULL;
572
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000573 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000574 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000575 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000576 return v;
577}
578
579static PyObject * _PyCodec_TextEncoder(const char *encoding)
580{
Nick Coghlana9b15242014-02-04 22:11:18 +1000581 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582}
583
584static PyObject * _PyCodec_TextDecoder(const char *encoding)
585{
Nick Coghlana9b15242014-02-04 22:11:18 +1000586 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000587}
588
589PyObject *_PyCodec_EncodeText(PyObject *object,
590 const char *encoding,
591 const char *errors)
592{
593 PyObject *encoder;
594
595 encoder = _PyCodec_TextEncoder(encoding);
596 if (encoder == NULL)
597 return NULL;
598
599 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
600}
601
602PyObject *_PyCodec_DecodeText(PyObject *object,
603 const char *encoding,
604 const char *errors)
605{
606 PyObject *decoder;
607
608 decoder = _PyCodec_TextDecoder(encoding);
609 if (decoder == NULL)
610 return NULL;
611
612 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
613}
614
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000615/* Register the error handling callback function error under the name
616 name. This function will be called by the codec when it encounters
617 an unencodable characters/undecodable bytes and doesn't know the
618 callback name, when name is specified as the error parameter
619 in the call to the encode/decode function.
620 Return 0 on success, -1 on error */
621int PyCodec_RegisterError(const char *name, PyObject *error)
622{
Victor Stinnercaba55b2018-08-03 15:33:52 +0200623 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000624 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000625 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 PyErr_SetString(PyExc_TypeError, "handler must be callable");
628 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000629 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000630 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300631 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000632}
633
634/* Lookup the error handling callback function registered under the
635 name error. As a special case NULL can be passed, in which case
636 the error handling callback for strict encoding will be returned. */
637PyObject *PyCodec_LookupError(const char *name)
638{
639 PyObject *handler = NULL;
640
Victor Stinnercaba55b2018-08-03 15:33:52 +0200641 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000642 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000643 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000644
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000645 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000646 name = "strict";
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200647 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
648 if (handler) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000649 Py_INCREF(handler);
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200650 }
651 else if (!PyErr_Occurred()) {
652 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
653 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000654 return handler;
655}
656
657static void wrong_exception_type(PyObject *exc)
658{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300659 PyErr_Format(PyExc_TypeError,
660 "don't know how to handle %.200s in error callback",
661 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662}
663
664PyObject *PyCodec_StrictErrors(PyObject *exc)
665{
Brett Cannonbf364092006-03-01 04:25:17 +0000666 if (PyExceptionInstance_Check(exc))
667 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670 return NULL;
671}
672
673
674PyObject *PyCodec_IgnoreErrors(PyObject *exc)
675{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000676 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300677
678 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 if (PyUnicodeEncodeError_GetEnd(exc, &end))
680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300682 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 if (PyUnicodeDecodeError_GetEnd(exc, &end))
684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300686 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 if (PyUnicodeTranslateError_GetEnd(exc, &end))
688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000689 }
690 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 wrong_exception_type(exc);
692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 }
Victor Stinneree450092011-12-01 02:52:11 +0100694 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695}
696
697
698PyObject *PyCodec_ReplaceErrors(PyObject *exc)
699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300702 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000703 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 int kind;
705 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 if (PyUnicodeEncodeError_GetStart(exc, &start))
707 return NULL;
708 if (PyUnicodeEncodeError_GetEnd(exc, &end))
709 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710 len = end - start;
711 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 if (res == NULL)
713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714 kind = PyUnicode_KIND(res);
715 data = PyUnicode_DATA(res);
716 for (i = 0; i < len; ++i)
717 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200718 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000720 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300721 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000722 if (PyUnicodeDecodeError_GetEnd(exc, &end))
723 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200724 return Py_BuildValue("(Cn)",
725 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
726 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000727 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300728 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200730 int kind;
731 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (PyUnicodeTranslateError_GetStart(exc, &start))
733 return NULL;
734 if (PyUnicodeTranslateError_GetEnd(exc, &end))
735 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 len = end - start;
737 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 if (res == NULL)
739 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 kind = PyUnicode_KIND(res);
741 data = PyUnicode_DATA(res);
742 for (i=0; i < len; i++)
743 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200744 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200745 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000746 }
747 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 wrong_exception_type(exc);
749 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750 }
751}
752
753PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
754{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300755 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 PyObject *restuple;
757 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100758 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000759 Py_ssize_t start;
760 Py_ssize_t end;
761 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100762 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300763 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100764 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 if (PyUnicodeEncodeError_GetStart(exc, &start))
766 return NULL;
767 if (PyUnicodeEncodeError_GetEnd(exc, &end))
768 return NULL;
769 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
770 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300771 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
772 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100773 for (i = start, ressize = 0; i < end; ++i) {
774 /* object is guaranteed to be "ready" */
775 ch = PyUnicode_READ_CHAR(object, i);
776 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100778 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100780 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+6+1;
788 else
789 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 }
791 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 if (res == NULL) {
794 Py_DECREF(object);
795 return NULL;
796 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100797 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100799 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 int digits;
801 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100802 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 *outp++ = '&';
804 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100805 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 digits = 1;
807 base = 1;
808 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100809 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 digits = 2;
811 base = 10;
812 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100813 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 digits = 3;
815 base = 100;
816 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100817 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 digits = 4;
819 base = 1000;
820 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 digits = 5;
823 base = 10000;
824 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 digits = 6;
827 base = 100000;
828 }
829 else {
830 digits = 7;
831 base = 1000000;
832 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100834 *outp++ = '0' + ch/base;
835 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000836 base /= 10;
837 }
838 *outp++ = ';';
839 }
Victor Stinner8f825062012-04-27 13:55:39 +0200840 assert(_PyUnicode_CheckConsistency(res, 1));
841 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 Py_DECREF(object);
843 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000844 }
845 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 wrong_exception_type(exc);
847 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848 }
849}
850
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000851PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
852{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200853 PyObject *object;
854 Py_ssize_t i;
855 Py_ssize_t start;
856 Py_ssize_t end;
857 PyObject *res;
858 unsigned char *outp;
859 int ressize;
860 Py_UCS4 c;
861
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300862 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300863 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200864 if (PyUnicodeDecodeError_GetStart(exc, &start))
865 return NULL;
866 if (PyUnicodeDecodeError_GetEnd(exc, &end))
867 return NULL;
868 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
869 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300870 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200871 res = PyUnicode_New(4 * (end - start), 127);
872 if (res == NULL) {
873 Py_DECREF(object);
874 return NULL;
875 }
876 outp = PyUnicode_1BYTE_DATA(res);
877 for (i = start; i < end; i++, outp += 4) {
878 unsigned char c = p[i];
879 outp[0] = '\\';
880 outp[1] = 'x';
881 outp[2] = Py_hexdigits[(c>>4)&0xf];
882 outp[3] = Py_hexdigits[c&0xf];
883 }
884
885 assert(_PyUnicode_CheckConsistency(res, 1));
886 Py_DECREF(object);
887 return Py_BuildValue("(Nn)", res, end);
888 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300889 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890 if (PyUnicodeEncodeError_GetStart(exc, &start))
891 return NULL;
892 if (PyUnicodeEncodeError_GetEnd(exc, &end))
893 return NULL;
894 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
895 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200896 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300897 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200898 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200900 if (PyUnicodeTranslateError_GetEnd(exc, &end))
901 return NULL;
902 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 }
905 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 wrong_exception_type(exc);
907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000908 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200909
910 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
911 end = start + PY_SSIZE_T_MAX / (1+1+8);
912 for (i = start, ressize = 0; i < end; ++i) {
913 /* object is guaranteed to be "ready" */
914 c = PyUnicode_READ_CHAR(object, i);
915 if (c >= 0x10000) {
916 ressize += 1+1+8;
917 }
918 else if (c >= 0x100) {
919 ressize += 1+1+4;
920 }
921 else
922 ressize += 1+1+2;
923 }
924 res = PyUnicode_New(ressize, 127);
925 if (res == NULL) {
926 Py_DECREF(object);
927 return NULL;
928 }
929 outp = PyUnicode_1BYTE_DATA(res);
930 for (i = start; i < end; ++i) {
931 c = PyUnicode_READ_CHAR(object, i);
932 *outp++ = '\\';
933 if (c >= 0x00010000) {
934 *outp++ = 'U';
935 *outp++ = Py_hexdigits[(c>>28)&0xf];
936 *outp++ = Py_hexdigits[(c>>24)&0xf];
937 *outp++ = Py_hexdigits[(c>>20)&0xf];
938 *outp++ = Py_hexdigits[(c>>16)&0xf];
939 *outp++ = Py_hexdigits[(c>>12)&0xf];
940 *outp++ = Py_hexdigits[(c>>8)&0xf];
941 }
942 else if (c >= 0x100) {
943 *outp++ = 'u';
944 *outp++ = Py_hexdigits[(c>>12)&0xf];
945 *outp++ = Py_hexdigits[(c>>8)&0xf];
946 }
947 else
948 *outp++ = 'x';
949 *outp++ = Py_hexdigits[(c>>4)&0xf];
950 *outp++ = Py_hexdigits[c&0xf];
951 }
952
953 assert(_PyUnicode_CheckConsistency(res, 1));
954 Py_DECREF(object);
955 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000956}
957
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200958static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200959
960PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
961{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300962 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200963 PyObject *restuple;
964 PyObject *object;
965 Py_ssize_t i;
966 Py_ssize_t start;
967 Py_ssize_t end;
968 PyObject *res;
969 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200970 Py_ssize_t ressize;
971 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200972 Py_UCS4 c;
973 char buffer[256]; /* NAME_MAXLEN */
974 if (PyUnicodeEncodeError_GetStart(exc, &start))
975 return NULL;
976 if (PyUnicodeEncodeError_GetEnd(exc, &end))
977 return NULL;
978 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
979 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200980 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200981 /* load the unicode data module */
982 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
983 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200984 if (!ucnhash_CAPI)
985 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200986 }
987 for (i = start, ressize = 0; i < end; ++i) {
988 /* object is guaranteed to be "ready" */
989 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200990 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200991 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200992 }
993 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200994 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200995 }
996 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200997 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200998 }
999 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001000 replsize = 1+1+2;
1001 if (ressize > PY_SSIZE_T_MAX - replsize)
1002 break;
1003 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001004 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001005 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001006 res = PyUnicode_New(ressize, 127);
1007 if (res==NULL)
1008 return NULL;
1009 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1010 i < end; ++i) {
1011 c = PyUnicode_READ_CHAR(object, i);
1012 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001013 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001014 *outp++ = 'N';
1015 *outp++ = '{';
1016 strcpy((char *)outp, buffer);
1017 outp += strlen(buffer);
1018 *outp++ = '}';
1019 continue;
1020 }
1021 if (c >= 0x00010000) {
1022 *outp++ = 'U';
1023 *outp++ = Py_hexdigits[(c>>28)&0xf];
1024 *outp++ = Py_hexdigits[(c>>24)&0xf];
1025 *outp++ = Py_hexdigits[(c>>20)&0xf];
1026 *outp++ = Py_hexdigits[(c>>16)&0xf];
1027 *outp++ = Py_hexdigits[(c>>12)&0xf];
1028 *outp++ = Py_hexdigits[(c>>8)&0xf];
1029 }
1030 else if (c >= 0x100) {
1031 *outp++ = 'u';
1032 *outp++ = Py_hexdigits[(c>>12)&0xf];
1033 *outp++ = Py_hexdigits[(c>>8)&0xf];
1034 }
1035 else
1036 *outp++ = 'x';
1037 *outp++ = Py_hexdigits[(c>>4)&0xf];
1038 *outp++ = Py_hexdigits[c&0xf];
1039 }
1040
Benjamin Peterson3663b582014-11-26 14:39:54 -06001041 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001042 assert(_PyUnicode_CheckConsistency(res, 1));
1043 restuple = Py_BuildValue("(Nn)", res, end);
1044 Py_DECREF(object);
1045 return restuple;
1046 }
1047 else {
1048 wrong_exception_type(exc);
1049 return NULL;
1050 }
1051}
1052
/* Encoding codes returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Interpret what follows "utf16"/"utf32" in an encoding name.

   An empty suffix selects the native byte order (WORDS_BIGENDIAN).
   Otherwise an optional '-' or '_' must be followed by exactly "be" or
   "le" (case-insensitive).  Returns be_code, le_code or ENC_UNKNOWN. */
static int
match_byteorder_suffix(const char *suffix, int be_code, int le_code)
{
    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return be_code;
#else
        return le_code;
#endif
    }
    if (*suffix == '-' || *suffix == '_')
        suffix++;
    /* Test suffix[0] first: for names such as "utf-16-" the pointer is
       already on the terminating NUL and suffix[1] must not be read. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        if (Py_TOLOWER(suffix[0]) == 'b')
            return be_code;
        if (Py_TOLOWER(suffix[0]) == 'l')
            return le_code;
    }
    return ENC_UNKNOWN;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognised spellings: "utf" (case-insensitive), an optional '-' or '_',
   then "8", "16" or "32"; the 16/32 forms may carry a byte-order suffix
   (see match_byteorder_suffix()).  The literal name "CP_UTF8" is also
   accepted.  On success *bytelength is set to 3, 2 or 4 respectively.
   When ENC_UNKNOWN is returned *bytelength may or may not have been
   written and must not be relied on. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return match_byteorder_suffix(encoding + 2,
                                          ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return match_byteorder_suffix(encoding + 2,
                                          ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001119/* This handler is declared static until someone demonstrates
1120 a need to call it directly. */
1121static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001122PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001123{
1124 PyObject *restuple;
1125 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001126 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001127 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001128 int code;
1129 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001130 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001131 Py_ssize_t start;
1132 Py_ssize_t end;
1133 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001134
1135 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001136 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001137 if (PyUnicodeEncodeError_GetStart(exc, &start))
1138 return NULL;
1139 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140 return NULL;
1141 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001143 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144 Py_DECREF(object);
1145 return NULL;
1146 }
1147 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148 Py_DECREF(object);
1149 Py_DECREF(encode);
1150 return NULL;
1151 }
1152 code = get_standard_encoding(encoding, &bytelength);
1153 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001154 if (code == ENC_UNKNOWN) {
1155 /* Not supported, fail with original exception */
1156 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157 Py_DECREF(object);
1158 return NULL;
1159 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001160
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001161 if (end - start > PY_SSIZE_T_MAX / bytelength)
1162 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001163 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 if (!res) {
1165 Py_DECREF(object);
1166 return NULL;
1167 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001168 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001169 for (i = start; i < end; i++) {
1170 /* object is guaranteed to be "ready" */
1171 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001172 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 /* Not a surrogate, fail with original exception */
1174 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175 Py_DECREF(res);
1176 Py_DECREF(object);
1177 return NULL;
1178 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001179 switch (code) {
1180 case ENC_UTF8:
1181 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184 break;
1185 case ENC_UTF16LE:
1186 *outp++ = (unsigned char) ch;
1187 *outp++ = (unsigned char)(ch >> 8);
1188 break;
1189 case ENC_UTF16BE:
1190 *outp++ = (unsigned char)(ch >> 8);
1191 *outp++ = (unsigned char) ch;
1192 break;
1193 case ENC_UTF32LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 *outp++ = (unsigned char)(ch >> 16);
1197 *outp++ = (unsigned char)(ch >> 24);
1198 break;
1199 case ENC_UTF32BE:
1200 *outp++ = (unsigned char)(ch >> 24);
1201 *outp++ = (unsigned char)(ch >> 16);
1202 *outp++ = (unsigned char)(ch >> 8);
1203 *outp++ = (unsigned char) ch;
1204 break;
1205 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 }
1207 restuple = Py_BuildValue("(On)", res, end);
1208 Py_DECREF(res);
1209 Py_DECREF(object);
1210 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001211 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001212 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001213 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001214 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 if (PyUnicodeDecodeError_GetStart(exc, &start))
1216 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001217 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001221 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001222 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223 Py_DECREF(object);
1224 return NULL;
1225 }
1226 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227 Py_DECREF(object);
1228 Py_DECREF(encode);
1229 return NULL;
1230 }
1231 code = get_standard_encoding(encoding, &bytelength);
1232 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001233 if (code == ENC_UNKNOWN) {
1234 /* Not supported, fail with original exception */
1235 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236 Py_DECREF(object);
1237 return NULL;
1238 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001239
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 /* Try decoding a single surrogate character. If
1241 there are more, let the codec call us again. */
1242 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001243 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244 switch (code) {
1245 case ENC_UTF8:
1246 if ((p[0] & 0xf0) == 0xe0 &&
1247 (p[1] & 0xc0) == 0x80 &&
1248 (p[2] & 0xc0) == 0x80) {
1249 /* it's a three-byte code */
1250 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251 }
1252 break;
1253 case ENC_UTF16LE:
1254 ch = p[1] << 8 | p[0];
1255 break;
1256 case ENC_UTF16BE:
1257 ch = p[0] << 8 | p[1];
1258 break;
1259 case ENC_UTF32LE:
1260 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261 break;
1262 case ENC_UTF32BE:
1263 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264 break;
1265 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001267
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001269 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272 return NULL;
1273 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001274 res = PyUnicode_FromOrdinal(ch);
1275 if (res == NULL)
1276 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001277 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001278 }
1279 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 wrong_exception_type(exc);
1281 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001282 }
1283}
1284
Martin v. Löwis011e8422009-05-05 04:43:17 +00001285static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001286PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001287{
1288 PyObject *restuple;
1289 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001290 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001291 Py_ssize_t start;
1292 Py_ssize_t end;
1293 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001294
1295 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 char *outp;
1297 if (PyUnicodeEncodeError_GetStart(exc, &start))
1298 return NULL;
1299 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300 return NULL;
1301 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 res = PyBytes_FromStringAndSize(NULL, end-start);
1304 if (!res) {
1305 Py_DECREF(object);
1306 return NULL;
1307 }
1308 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001309 for (i = start; i < end; i++) {
1310 /* object is guaranteed to be "ready" */
1311 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 if (ch < 0xdc80 || ch > 0xdcff) {
1313 /* Not a UTF-8b surrogate, fail with original exception */
1314 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315 Py_DECREF(res);
1316 Py_DECREF(object);
1317 return NULL;
1318 }
1319 *outp++ = ch - 0xdc00;
1320 }
1321 restuple = Py_BuildValue("(On)", res, end);
1322 Py_DECREF(res);
1323 Py_DECREF(object);
1324 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001325 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001326 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001327 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001328 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001329 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 int consumed = 0;
1331 if (PyUnicodeDecodeError_GetStart(exc, &start))
1332 return NULL;
1333 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334 return NULL;
1335 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001337 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 while (consumed < 4 && consumed < end-start) {
1339 /* Refuse to escape ASCII bytes. */
1340 if (p[start+consumed] < 128)
1341 break;
1342 ch[consumed] = 0xdc00 + p[start+consumed];
1343 consumed++;
1344 }
1345 Py_DECREF(object);
1346 if (!consumed) {
1347 /* codec complained about ASCII byte. */
1348 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349 return NULL;
1350 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001351 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352 if (str == NULL)
1353 return NULL;
1354 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001355 }
1356 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 wrong_exception_type(exc);
1358 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001359 }
1360}
1361
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363static PyObject *strict_errors(PyObject *self, PyObject *exc)
1364{
1365 return PyCodec_StrictErrors(exc);
1366}
1367
1368
1369static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1370{
1371 return PyCodec_IgnoreErrors(exc);
1372}
1373
1374
1375static PyObject *replace_errors(PyObject *self, PyObject *exc)
1376{
1377 return PyCodec_ReplaceErrors(exc);
1378}
1379
1380
1381static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1382{
1383 return PyCodec_XMLCharRefReplaceErrors(exc);
1384}
1385
1386
1387static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1388{
1389 return PyCodec_BackslashReplaceErrors(exc);
1390}
1391
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001392static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1393{
1394 return PyCodec_NameReplaceErrors(exc);
1395}
1396
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001397static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001398{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001399 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001400}
1401
Martin v. Löwis43c57782009-05-10 08:15:24 +00001402static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001403{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001404 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001405}
1406
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001407static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 char *name;
1411 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 } methods[] =
1413 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 {
1415 "strict",
1416 {
1417 "strict_errors",
1418 strict_errors,
1419 METH_O,
1420 PyDoc_STR("Implements the 'strict' error handling, which "
1421 "raises a UnicodeError on coding errors.")
1422 }
1423 },
1424 {
1425 "ignore",
1426 {
1427 "ignore_errors",
1428 ignore_errors,
1429 METH_O,
1430 PyDoc_STR("Implements the 'ignore' error handling, which "
1431 "ignores malformed data and continues.")
1432 }
1433 },
1434 {
1435 "replace",
1436 {
1437 "replace_errors",
1438 replace_errors,
1439 METH_O,
1440 PyDoc_STR("Implements the 'replace' error handling, which "
1441 "replaces malformed data with a replacement marker.")
1442 }
1443 },
1444 {
1445 "xmlcharrefreplace",
1446 {
1447 "xmlcharrefreplace_errors",
1448 xmlcharrefreplace_errors,
1449 METH_O,
1450 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451 "which replaces an unencodable character with the "
1452 "appropriate XML character reference.")
1453 }
1454 },
1455 {
1456 "backslashreplace",
1457 {
1458 "backslashreplace_errors",
1459 backslashreplace_errors,
1460 METH_O,
1461 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001462 "which replaces malformed data with a backslashed "
1463 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 }
1465 },
1466 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001467 "namereplace",
1468 {
1469 "namereplace_errors",
1470 namereplace_errors,
1471 METH_O,
1472 PyDoc_STR("Implements the 'namereplace' error handling, "
1473 "which replaces an unencodable character with a "
1474 "\\N{...} escape sequence.")
1475 }
1476 },
1477 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001478 "surrogatepass",
1479 {
1480 "surrogatepass",
1481 surrogatepass_errors,
1482 METH_O
1483 }
1484 },
1485 {
1486 "surrogateescape",
1487 {
1488 "surrogateescape",
1489 surrogateescape_errors,
1490 METH_O
1491 }
1492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494
Victor Stinnercaba55b2018-08-03 15:33:52 +02001495 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001496 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001497 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001498
1499 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001500 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001501
1502 interp->codec_search_path = PyList_New(0);
1503 interp->codec_search_cache = PyDict_New();
1504 interp->codec_error_registry = PyDict_New();
1505
1506 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001507 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001508 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001509 int res;
1510 if (!func)
1511 Py_FatalError("can't initialize codec error registry");
1512 res = PyCodec_RegisterError(methods[i].name, func);
1513 Py_DECREF(func);
1514 if (res)
1515 Py_FatalError("can't initialize codec error registry");
1516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001518
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001519 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 interp->codec_search_cache == NULL ||
1521 interp->codec_error_registry == NULL)
1522 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001523
Christian Heimes819b8bf2008-01-03 23:05:47 +00001524 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001525 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001526 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001527 }
1528 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001529 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001530 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001531}