blob: 62bbee61c0a3b5275728516c1d8078b63bc3fcf2 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinner27e2d1f2018-11-01 00:52:28 +010012#include "pycore_state.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
Victor Stinnerf5cff562011-10-14 02:13:11 +020016const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
20/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000022
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
Guido van Rossumb95de4f2000-03-31 17:25:23 +000026 ImportErrors are silently ignored by this function. Only one try is
27 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
29*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Victor Stinnercaba55b2018-08-03 15:33:52 +020035 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Guido van Rossum9e896b32000-04-05 20:11:21 +000052/* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
Guido van Rossum21431e82007-10-19 21:48:41 +000081 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000102 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000103 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000104
Fred Drake766de832000-05-09 19:55:59 +0000105 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000106 PyErr_BadArgument();
107 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000108 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000109
Victor Stinnercaba55b2018-08-03 15:33:52 +0200110 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000111 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000112 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113
Guido van Rossum9e896b32000-04-05 20:11:21 +0000114 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000115 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000116 replaced with underscores. */
117 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000118 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000119 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000120 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000121
122 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000123 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000124 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000125 Py_INCREF(result);
126 Py_DECREF(v);
127 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000128 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000129
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 args = PyTuple_New(1);
132 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000133 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000134 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000135
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000136 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000137 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000138 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000139 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000140 PyErr_SetString(PyExc_LookupError,
141 "no codec search functions registered: "
142 "can't find encoding");
143 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000144 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145
146 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000147 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000148
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000149 func = PyList_GetItem(interp->codec_search_path, i);
150 if (func == NULL)
151 goto onError;
152 result = PyEval_CallObject(func, args);
153 if (result == NULL)
154 goto onError;
155 if (result == Py_None) {
156 Py_DECREF(result);
157 continue;
158 }
159 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
160 PyErr_SetString(PyExc_TypeError,
161 "codec search functions must return 4-tuples");
162 Py_DECREF(result);
163 goto onError;
164 }
165 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 }
167 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000168 /* XXX Perhaps we should cache misses too ? */
169 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000170 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000171 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000172 }
173
174 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000175 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000176 Py_DECREF(result);
177 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000178 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000179 Py_DECREF(args);
180 return result;
181
182 onError:
183 Py_XDECREF(args);
184 return NULL;
185}
186
Nick Coghlan8fad1672014-09-15 23:50:44 +1200187int _PyCodec_Forget(const char *encoding)
188{
Nick Coghlan8fad1672014-09-15 23:50:44 +1200189 PyObject *v;
190 int result;
191
Victor Stinnercaba55b2018-08-03 15:33:52 +0200192 PyInterpreterState *interp = _PyInterpreterState_Get();
Nick Coghlan8fad1672014-09-15 23:50:44 +1200193 if (interp->codec_search_path == NULL) {
194 return -1;
195 }
196
197 /* Convert the encoding to a normalized Python string: all
198 characters are converted to lower case, spaces and hyphens are
199 replaced with underscores. */
200 v = normalizestring(encoding);
201 if (v == NULL) {
202 return -1;
203 }
204
205 /* Drop the named codec from the internal cache */
206 result = PyDict_DelItem(interp->codec_search_cache, v);
207 Py_DECREF(v);
208
209 return result;
210}
211
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000212/* Codec registry encoding check API. */
213
214int PyCodec_KnownEncoding(const char *encoding)
215{
216 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000218 codecs = _PyCodec_Lookup(encoding);
219 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000220 PyErr_Clear();
221 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000222 }
223 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000224 Py_DECREF(codecs);
225 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000226 }
227}
228
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000229static
230PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000231 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000232{
233 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000234
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000235 args = PyTuple_New(1 + (errors != NULL));
236 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000237 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000238 Py_INCREF(object);
239 PyTuple_SET_ITEM(args,0,object);
240 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000241 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000242
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 v = PyUnicode_FromString(errors);
244 if (v == NULL) {
245 Py_DECREF(args);
246 return NULL;
247 }
248 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000249 }
250 return args;
251}
252
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000254
255static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000257{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 PyObject *codecs;
259 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000260
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 codecs = _PyCodec_Lookup(encoding);
262 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000263 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 v = PyTuple_GET_ITEM(codecs, index);
265 Py_DECREF(codecs);
266 Py_INCREF(v);
267 return v;
268}
269
Nick Coghlana9b15242014-02-04 22:11:18 +1000270/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000271static
Nick Coghlana9b15242014-02-04 22:11:18 +1000272PyObject *codec_makeincrementalcodec(PyObject *codec_info,
273 const char *errors,
274 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000275{
Nick Coghlana9b15242014-02-04 22:11:18 +1000276 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277
Nick Coghlana9b15242014-02-04 22:11:18 +1000278 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000280 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100284 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 Py_DECREF(inccodec);
286 return ret;
287}
288
Nick Coghlana9b15242014-02-04 22:11:18 +1000289static
290PyObject *codec_getincrementalcodec(const char *encoding,
291 const char *errors,
292 const char *attrname)
293{
294 PyObject *codec_info, *ret;
295
296 codec_info = _PyCodec_Lookup(encoding);
297 if (codec_info == NULL)
298 return NULL;
299 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
300 Py_DECREF(codec_info);
301 return ret;
302}
303
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000304/* Helper function to create a stream codec. */
305
306static
307PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000308 PyObject *stream,
309 const char *errors,
310 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000311{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000312 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000313
314 codecs = _PyCodec_Lookup(encoding);
315 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000316 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000317
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000318 codeccls = PyTuple_GET_ITEM(codecs, index);
319 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000320 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000321 else
Victor Stinner7bfb42d2016-12-05 17:04:32 +0100322 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000323 Py_DECREF(codecs);
324 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000325}
326
Nick Coghlana9b15242014-02-04 22:11:18 +1000327/* Helpers to work with the result of _PyCodec_Lookup
328
329 */
330PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
331 const char *errors)
332{
333 return codec_makeincrementalcodec(codec_info, errors,
334 "incrementaldecoder");
335}
336
337PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
338 const char *errors)
339{
340 return codec_makeincrementalcodec(codec_info, errors,
341 "incrementalencoder");
342}
343
344
Guido van Rossum98297ee2007-11-06 21:34:58 +0000345/* Convenience APIs to query the Codec registry.
346
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000347 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349 */
350
351PyObject *PyCodec_Encoder(const char *encoding)
352{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000354}
355
356PyObject *PyCodec_Decoder(const char *encoding)
357{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000358 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359}
360
Thomas Woutersa9773292006-04-21 09:43:23 +0000361PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000362 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000363{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000364 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000365}
366
367PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000368 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000369{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000370 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000371}
372
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000374 PyObject *stream,
375 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000377 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378}
379
380PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000381 PyObject *stream,
382 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000383{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000384 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000385}
386
/* Helper that tries to make the reported exception chain name the codec
 * whose invocation triggered the failure, without changing the type of
 * the exception raised.
 *
 * operation describes what was attempted (the callers in this file pass
 * "encoding" or "decoding"); encoding is the codec name.
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a
     * suitably updated clone if it can; otherwise the original exception
     * is left alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
402
Martin Panter6245cb32016-04-15 02:14:19 +0000403/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000404 and return the resulting encoded object (usually a Python string).
405
406 errors is passed to the encoder factory as argument if non-NULL. */
407
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000408static PyObject *
409_PyCodec_EncodeInternal(PyObject *object,
410 PyObject *encoder,
411 const char *encoding,
412 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000413{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000414 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000415 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000416
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000417 args = args_tuple(object, errors);
418 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000419 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000420
421 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000422 if (result == NULL) {
423 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000424 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000425 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000426
Guido van Rossum98297ee2007-11-06 21:34:58 +0000427 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000428 PyTuple_GET_SIZE(result) != 2) {
429 PyErr_SetString(PyExc_TypeError,
430 "encoder must return a tuple (object, integer)");
431 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000432 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000433 v = PyTuple_GET_ITEM(result,0);
434 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000435 /* We don't check or use the second (integer) entry. */
436
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000437 Py_DECREF(args);
438 Py_DECREF(encoder);
439 Py_DECREF(result);
440 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000442 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000443 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000444 Py_XDECREF(args);
445 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000446 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000447}
448
449/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000450 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000451
452 errors is passed to the decoder factory as argument if non-NULL. */
453
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000454static PyObject *
455_PyCodec_DecodeInternal(PyObject *object,
456 PyObject *decoder,
457 const char *encoding,
458 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000459{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000460 PyObject *args = NULL, *result = NULL;
461 PyObject *v;
462
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000463 args = args_tuple(object, errors);
464 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000465 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000466
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000467 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000468 if (result == NULL) {
469 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000471 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000472 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 PyTuple_GET_SIZE(result) != 2) {
474 PyErr_SetString(PyExc_TypeError,
475 "decoder must return a tuple (object,integer)");
476 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000477 }
478 v = PyTuple_GET_ITEM(result,0);
479 Py_INCREF(v);
480 /* We don't check or use the second (integer) entry. */
481
482 Py_DECREF(args);
483 Py_DECREF(decoder);
484 Py_DECREF(result);
485 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000486
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000487 onError:
488 Py_XDECREF(args);
489 Py_XDECREF(decoder);
490 Py_XDECREF(result);
491 return NULL;
492}
493
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000494/* Generic encoding/decoding API */
495PyObject *PyCodec_Encode(PyObject *object,
496 const char *encoding,
497 const char *errors)
498{
499 PyObject *encoder;
500
501 encoder = PyCodec_Encoder(encoding);
502 if (encoder == NULL)
503 return NULL;
504
505 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
506}
507
508PyObject *PyCodec_Decode(PyObject *object,
509 const char *encoding,
510 const char *errors)
511{
512 PyObject *decoder;
513
514 decoder = PyCodec_Decoder(encoding);
515 if (decoder == NULL)
516 return NULL;
517
518 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
519}
520
521/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000522PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
523 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000524{
525 _Py_IDENTIFIER(_is_text_encoding);
526 PyObject *codec;
527 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000528 int is_text_codec;
529
530 codec = _PyCodec_Lookup(encoding);
531 if (codec == NULL)
532 return NULL;
533
534 /* Backwards compatibility: assume any raw tuple describes a text
535 * encoding, and the same for anything lacking the private
536 * attribute.
537 */
538 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200539 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
540 Py_DECREF(codec);
541 return NULL;
542 }
543 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000544 is_text_codec = PyObject_IsTrue(attr);
545 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300546 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000547 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300548 if (!is_text_codec)
549 PyErr_Format(PyExc_LookupError,
550 "'%.400s' is not a text encoding; "
551 "use %s to handle arbitrary codecs",
552 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000553 return NULL;
554 }
555 }
556 }
557
Nick Coghlana9b15242014-02-04 22:11:18 +1000558 /* This appears to be a valid text encoding */
559 return codec;
560}
561
562
563static
564PyObject *codec_getitem_checked(const char *encoding,
565 const char *alternate_command,
566 int index)
567{
568 PyObject *codec;
569 PyObject *v;
570
571 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
572 if (codec == NULL)
573 return NULL;
574
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000575 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000576 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000577 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000578 return v;
579}
580
581static PyObject * _PyCodec_TextEncoder(const char *encoding)
582{
Nick Coghlana9b15242014-02-04 22:11:18 +1000583 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000584}
585
586static PyObject * _PyCodec_TextDecoder(const char *encoding)
587{
Nick Coghlana9b15242014-02-04 22:11:18 +1000588 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000589}
590
591PyObject *_PyCodec_EncodeText(PyObject *object,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *encoder;
596
597 encoder = _PyCodec_TextEncoder(encoding);
598 if (encoder == NULL)
599 return NULL;
600
601 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
602}
603
604PyObject *_PyCodec_DecodeText(PyObject *object,
605 const char *encoding,
606 const char *errors)
607{
608 PyObject *decoder;
609
610 decoder = _PyCodec_TextDecoder(encoding);
611 if (decoder == NULL)
612 return NULL;
613
614 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
615}
616
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000617/* Register the error handling callback function error under the name
618 name. This function will be called by the codec when it encounters
619 an unencodable characters/undecodable bytes and doesn't know the
620 callback name, when name is specified as the error parameter
621 in the call to the encode/decode function.
622 Return 0 on success, -1 on error */
623int PyCodec_RegisterError(const char *name, PyObject *error)
624{
Victor Stinnercaba55b2018-08-03 15:33:52 +0200625 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000626 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000628 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000629 PyErr_SetString(PyExc_TypeError, "handler must be callable");
630 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000631 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000632 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300633 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000634}
635
636/* Lookup the error handling callback function registered under the
637 name error. As a special case NULL can be passed, in which case
638 the error handling callback for strict encoding will be returned. */
639PyObject *PyCodec_LookupError(const char *name)
640{
641 PyObject *handler = NULL;
642
Victor Stinnercaba55b2018-08-03 15:33:52 +0200643 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000644 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000646
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000647 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300649 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000650 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000654 return handler;
655}
656
657static void wrong_exception_type(PyObject *exc)
658{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300659 PyErr_Format(PyExc_TypeError,
660 "don't know how to handle %.200s in error callback",
661 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000662}
663
664PyObject *PyCodec_StrictErrors(PyObject *exc)
665{
Brett Cannonbf364092006-03-01 04:25:17 +0000666 if (PyExceptionInstance_Check(exc))
667 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000669 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670 return NULL;
671}
672
673
674PyObject *PyCodec_IgnoreErrors(PyObject *exc)
675{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000676 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300677
678 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000679 if (PyUnicodeEncodeError_GetEnd(exc, &end))
680 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000681 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300682 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 if (PyUnicodeDecodeError_GetEnd(exc, &end))
684 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000685 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300686 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000687 if (PyUnicodeTranslateError_GetEnd(exc, &end))
688 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000689 }
690 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000691 wrong_exception_type(exc);
692 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000693 }
Victor Stinneree450092011-12-01 02:52:11 +0100694 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695}
696
697
698PyObject *PyCodec_ReplaceErrors(PyObject *exc)
699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200700 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300702 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000703 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200704 int kind;
705 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000706 if (PyUnicodeEncodeError_GetStart(exc, &start))
707 return NULL;
708 if (PyUnicodeEncodeError_GetEnd(exc, &end))
709 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710 len = end - start;
711 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 if (res == NULL)
713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200714 kind = PyUnicode_KIND(res);
715 data = PyUnicode_DATA(res);
716 for (i = 0; i < len; ++i)
717 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200718 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000720 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300721 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000722 if (PyUnicodeDecodeError_GetEnd(exc, &end))
723 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200724 return Py_BuildValue("(Cn)",
725 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
726 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000727 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300728 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200730 int kind;
731 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000732 if (PyUnicodeTranslateError_GetStart(exc, &start))
733 return NULL;
734 if (PyUnicodeTranslateError_GetEnd(exc, &end))
735 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 len = end - start;
737 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 if (res == NULL)
739 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 kind = PyUnicode_KIND(res);
741 data = PyUnicode_DATA(res);
742 for (i=0; i < len; i++)
743 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200744 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200745 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000746 }
747 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000748 wrong_exception_type(exc);
749 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750 }
751}
752
753PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
754{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300755 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000756 PyObject *restuple;
757 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100758 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000759 Py_ssize_t start;
760 Py_ssize_t end;
761 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100762 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300763 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100764 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 if (PyUnicodeEncodeError_GetStart(exc, &start))
766 return NULL;
767 if (PyUnicodeEncodeError_GetEnd(exc, &end))
768 return NULL;
769 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
770 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300771 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
772 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100773 for (i = start, ressize = 0; i < end; ++i) {
774 /* object is guaranteed to be "ready" */
775 ch = PyUnicode_READ_CHAR(object, i);
776 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000777 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100778 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100780 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+6+1;
788 else
789 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 }
791 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 if (res == NULL) {
794 Py_DECREF(object);
795 return NULL;
796 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100797 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100799 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 int digits;
801 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100802 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 *outp++ = '&';
804 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100805 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 digits = 1;
807 base = 1;
808 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100809 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 digits = 2;
811 base = 10;
812 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100813 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000814 digits = 3;
815 base = 100;
816 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100817 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 digits = 4;
819 base = 1000;
820 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100821 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000822 digits = 5;
823 base = 10000;
824 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100825 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000826 digits = 6;
827 base = 100000;
828 }
829 else {
830 digits = 7;
831 base = 1000000;
832 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100834 *outp++ = '0' + ch/base;
835 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000836 base /= 10;
837 }
838 *outp++ = ';';
839 }
Victor Stinner8f825062012-04-27 13:55:39 +0200840 assert(_PyUnicode_CheckConsistency(res, 1));
841 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 Py_DECREF(object);
843 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000844 }
845 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 wrong_exception_type(exc);
847 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000848 }
849}
850
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000851PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
852{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200853 PyObject *object;
854 Py_ssize_t i;
855 Py_ssize_t start;
856 Py_ssize_t end;
857 PyObject *res;
858 unsigned char *outp;
859 int ressize;
860 Py_UCS4 c;
861
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300862 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300863 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200864 if (PyUnicodeDecodeError_GetStart(exc, &start))
865 return NULL;
866 if (PyUnicodeDecodeError_GetEnd(exc, &end))
867 return NULL;
868 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
869 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300870 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200871 res = PyUnicode_New(4 * (end - start), 127);
872 if (res == NULL) {
873 Py_DECREF(object);
874 return NULL;
875 }
876 outp = PyUnicode_1BYTE_DATA(res);
877 for (i = start; i < end; i++, outp += 4) {
878 unsigned char c = p[i];
879 outp[0] = '\\';
880 outp[1] = 'x';
881 outp[2] = Py_hexdigits[(c>>4)&0xf];
882 outp[3] = Py_hexdigits[c&0xf];
883 }
884
885 assert(_PyUnicode_CheckConsistency(res, 1));
886 Py_DECREF(object);
887 return Py_BuildValue("(Nn)", res, end);
888 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300889 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000890 if (PyUnicodeEncodeError_GetStart(exc, &start))
891 return NULL;
892 if (PyUnicodeEncodeError_GetEnd(exc, &end))
893 return NULL;
894 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
895 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200896 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300897 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200898 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200900 if (PyUnicodeTranslateError_GetEnd(exc, &end))
901 return NULL;
902 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
903 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 }
905 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 wrong_exception_type(exc);
907 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000908 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200909
910 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
911 end = start + PY_SSIZE_T_MAX / (1+1+8);
912 for (i = start, ressize = 0; i < end; ++i) {
913 /* object is guaranteed to be "ready" */
914 c = PyUnicode_READ_CHAR(object, i);
915 if (c >= 0x10000) {
916 ressize += 1+1+8;
917 }
918 else if (c >= 0x100) {
919 ressize += 1+1+4;
920 }
921 else
922 ressize += 1+1+2;
923 }
924 res = PyUnicode_New(ressize, 127);
925 if (res == NULL) {
926 Py_DECREF(object);
927 return NULL;
928 }
929 outp = PyUnicode_1BYTE_DATA(res);
930 for (i = start; i < end; ++i) {
931 c = PyUnicode_READ_CHAR(object, i);
932 *outp++ = '\\';
933 if (c >= 0x00010000) {
934 *outp++ = 'U';
935 *outp++ = Py_hexdigits[(c>>28)&0xf];
936 *outp++ = Py_hexdigits[(c>>24)&0xf];
937 *outp++ = Py_hexdigits[(c>>20)&0xf];
938 *outp++ = Py_hexdigits[(c>>16)&0xf];
939 *outp++ = Py_hexdigits[(c>>12)&0xf];
940 *outp++ = Py_hexdigits[(c>>8)&0xf];
941 }
942 else if (c >= 0x100) {
943 *outp++ = 'u';
944 *outp++ = Py_hexdigits[(c>>12)&0xf];
945 *outp++ = Py_hexdigits[(c>>8)&0xf];
946 }
947 else
948 *outp++ = 'x';
949 *outp++ = Py_hexdigits[(c>>4)&0xf];
950 *outp++ = Py_hexdigits[c&0xf];
951 }
952
953 assert(_PyUnicode_CheckConsistency(res, 1));
954 Py_DECREF(object);
955 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000956}
957
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200958static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200959
960PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
961{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300962 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200963 PyObject *restuple;
964 PyObject *object;
965 Py_ssize_t i;
966 Py_ssize_t start;
967 Py_ssize_t end;
968 PyObject *res;
969 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200970 Py_ssize_t ressize;
971 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200972 Py_UCS4 c;
973 char buffer[256]; /* NAME_MAXLEN */
974 if (PyUnicodeEncodeError_GetStart(exc, &start))
975 return NULL;
976 if (PyUnicodeEncodeError_GetEnd(exc, &end))
977 return NULL;
978 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
979 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200980 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200981 /* load the unicode data module */
982 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
983 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200984 if (!ucnhash_CAPI)
985 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200986 }
987 for (i = start, ressize = 0; i < end; ++i) {
988 /* object is guaranteed to be "ready" */
989 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200990 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200991 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200992 }
993 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200994 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200995 }
996 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200997 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200998 }
999 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001000 replsize = 1+1+2;
1001 if (ressize > PY_SSIZE_T_MAX - replsize)
1002 break;
1003 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001004 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001005 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001006 res = PyUnicode_New(ressize, 127);
1007 if (res==NULL)
1008 return NULL;
1009 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1010 i < end; ++i) {
1011 c = PyUnicode_READ_CHAR(object, i);
1012 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001013 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001014 *outp++ = 'N';
1015 *outp++ = '{';
1016 strcpy((char *)outp, buffer);
1017 outp += strlen(buffer);
1018 *outp++ = '}';
1019 continue;
1020 }
1021 if (c >= 0x00010000) {
1022 *outp++ = 'U';
1023 *outp++ = Py_hexdigits[(c>>28)&0xf];
1024 *outp++ = Py_hexdigits[(c>>24)&0xf];
1025 *outp++ = Py_hexdigits[(c>>20)&0xf];
1026 *outp++ = Py_hexdigits[(c>>16)&0xf];
1027 *outp++ = Py_hexdigits[(c>>12)&0xf];
1028 *outp++ = Py_hexdigits[(c>>8)&0xf];
1029 }
1030 else if (c >= 0x100) {
1031 *outp++ = 'u';
1032 *outp++ = Py_hexdigits[(c>>12)&0xf];
1033 *outp++ = Py_hexdigits[(c>>8)&0xf];
1034 }
1035 else
1036 *outp++ = 'x';
1037 *outp++ = Py_hexdigits[(c>>4)&0xf];
1038 *outp++ = Py_hexdigits[c&0xf];
1039 }
1040
Benjamin Peterson3663b582014-11-26 14:39:54 -06001041 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001042 assert(_PyUnicode_CheckConsistency(res, 1));
1043 restuple = Py_BuildValue("(Nn)", res, end);
1044 Py_DECREF(object);
1045 return restuple;
1046 }
1047 else {
1048 wrong_exception_type(exc);
1049 return NULL;
1050 }
1051}
1052
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name onto one of the ENC_* codes.

   Recognised spellings: "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; for 16/32 an optional '-' or '_' followed by "be" or
   "le" (any case).  Without a byte-order suffix the native order is chosen
   via WORDS_BIGENDIAN.  The exact name "CP_UTF8" is accepted as UTF-8.

   *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for UTF-32.  It may
   also have been written when ENC_UNKNOWN is returned (e.g. for a bad
   byte-order suffix), so callers must not rely on it in that case. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* After skipping a separator we may sit on the terminator
               (e.g. "utf-16-"); check encoding[0] first so encoding[1]
               is never read past the end of the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16: do not look beyond the NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1118
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001119/* This handler is declared static until someone demonstrates
1120 a need to call it directly. */
1121static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001122PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001123{
1124 PyObject *restuple;
1125 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001126 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001127 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001128 int code;
1129 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001130 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001131 Py_ssize_t start;
1132 Py_ssize_t end;
1133 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001134
1135 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001136 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001137 if (PyUnicodeEncodeError_GetStart(exc, &start))
1138 return NULL;
1139 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1140 return NULL;
1141 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1142 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001143 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1144 Py_DECREF(object);
1145 return NULL;
1146 }
1147 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1148 Py_DECREF(object);
1149 Py_DECREF(encode);
1150 return NULL;
1151 }
1152 code = get_standard_encoding(encoding, &bytelength);
1153 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001154 if (code == ENC_UNKNOWN) {
1155 /* Not supported, fail with original exception */
1156 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157 Py_DECREF(object);
1158 return NULL;
1159 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001160
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001161 if (end - start > PY_SSIZE_T_MAX / bytelength)
1162 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001163 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001164 if (!res) {
1165 Py_DECREF(object);
1166 return NULL;
1167 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001168 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001169 for (i = start; i < end; i++) {
1170 /* object is guaranteed to be "ready" */
1171 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001172 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001173 /* Not a surrogate, fail with original exception */
1174 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175 Py_DECREF(res);
1176 Py_DECREF(object);
1177 return NULL;
1178 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001179 switch (code) {
1180 case ENC_UTF8:
1181 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1182 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1183 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1184 break;
1185 case ENC_UTF16LE:
1186 *outp++ = (unsigned char) ch;
1187 *outp++ = (unsigned char)(ch >> 8);
1188 break;
1189 case ENC_UTF16BE:
1190 *outp++ = (unsigned char)(ch >> 8);
1191 *outp++ = (unsigned char) ch;
1192 break;
1193 case ENC_UTF32LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 *outp++ = (unsigned char)(ch >> 16);
1197 *outp++ = (unsigned char)(ch >> 24);
1198 break;
1199 case ENC_UTF32BE:
1200 *outp++ = (unsigned char)(ch >> 24);
1201 *outp++ = (unsigned char)(ch >> 16);
1202 *outp++ = (unsigned char)(ch >> 8);
1203 *outp++ = (unsigned char) ch;
1204 break;
1205 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001206 }
1207 restuple = Py_BuildValue("(On)", res, end);
1208 Py_DECREF(res);
1209 Py_DECREF(object);
1210 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001211 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001212 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001213 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001214 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215 if (PyUnicodeDecodeError_GetStart(exc, &start))
1216 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001217 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001219 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001221 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001222 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1223 Py_DECREF(object);
1224 return NULL;
1225 }
1226 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1227 Py_DECREF(object);
1228 Py_DECREF(encode);
1229 return NULL;
1230 }
1231 code = get_standard_encoding(encoding, &bytelength);
1232 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001233 if (code == ENC_UNKNOWN) {
1234 /* Not supported, fail with original exception */
1235 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236 Py_DECREF(object);
1237 return NULL;
1238 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001239
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001240 /* Try decoding a single surrogate character. If
1241 there are more, let the codec call us again. */
1242 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001243 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1244 switch (code) {
1245 case ENC_UTF8:
1246 if ((p[0] & 0xf0) == 0xe0 &&
1247 (p[1] & 0xc0) == 0x80 &&
1248 (p[2] & 0xc0) == 0x80) {
1249 /* it's a three-byte code */
1250 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1251 }
1252 break;
1253 case ENC_UTF16LE:
1254 ch = p[1] << 8 | p[0];
1255 break;
1256 case ENC_UTF16BE:
1257 ch = p[0] << 8 | p[1];
1258 break;
1259 case ENC_UTF32LE:
1260 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1261 break;
1262 case ENC_UTF32BE:
1263 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1264 break;
1265 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001266 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001267
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001269 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1270 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001271 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1272 return NULL;
1273 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001274 res = PyUnicode_FromOrdinal(ch);
1275 if (res == NULL)
1276 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001277 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001278 }
1279 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 wrong_exception_type(exc);
1281 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001282 }
1283}
1284
Martin v. Löwis011e8422009-05-05 04:43:17 +00001285static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001286PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001287{
1288 PyObject *restuple;
1289 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001290 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001291 Py_ssize_t start;
1292 Py_ssize_t end;
1293 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001294
1295 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 char *outp;
1297 if (PyUnicodeEncodeError_GetStart(exc, &start))
1298 return NULL;
1299 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1300 return NULL;
1301 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1302 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 res = PyBytes_FromStringAndSize(NULL, end-start);
1304 if (!res) {
1305 Py_DECREF(object);
1306 return NULL;
1307 }
1308 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001309 for (i = start; i < end; i++) {
1310 /* object is guaranteed to be "ready" */
1311 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001312 if (ch < 0xdc80 || ch > 0xdcff) {
1313 /* Not a UTF-8b surrogate, fail with original exception */
1314 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1315 Py_DECREF(res);
1316 Py_DECREF(object);
1317 return NULL;
1318 }
1319 *outp++ = ch - 0xdc00;
1320 }
1321 restuple = Py_BuildValue("(On)", res, end);
1322 Py_DECREF(res);
1323 Py_DECREF(object);
1324 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001325 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001326 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001327 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001328 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001329 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001330 int consumed = 0;
1331 if (PyUnicodeDecodeError_GetStart(exc, &start))
1332 return NULL;
1333 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1334 return NULL;
1335 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1336 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001337 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001338 while (consumed < 4 && consumed < end-start) {
1339 /* Refuse to escape ASCII bytes. */
1340 if (p[start+consumed] < 128)
1341 break;
1342 ch[consumed] = 0xdc00 + p[start+consumed];
1343 consumed++;
1344 }
1345 Py_DECREF(object);
1346 if (!consumed) {
1347 /* codec complained about ASCII byte. */
1348 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1349 return NULL;
1350 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001351 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1352 if (str == NULL)
1353 return NULL;
1354 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001355 }
1356 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 wrong_exception_type(exc);
1358 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001359 }
1360}
1361
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001362
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001363static PyObject *strict_errors(PyObject *self, PyObject *exc)
1364{
1365 return PyCodec_StrictErrors(exc);
1366}
1367
1368
1369static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1370{
1371 return PyCodec_IgnoreErrors(exc);
1372}
1373
1374
1375static PyObject *replace_errors(PyObject *self, PyObject *exc)
1376{
1377 return PyCodec_ReplaceErrors(exc);
1378}
1379
1380
1381static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1382{
1383 return PyCodec_XMLCharRefReplaceErrors(exc);
1384}
1385
1386
1387static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1388{
1389 return PyCodec_BackslashReplaceErrors(exc);
1390}
1391
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001392static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1393{
1394 return PyCodec_NameReplaceErrors(exc);
1395}
1396
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001397static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001398{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001399 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001400}
1401
Martin v. Löwis43c57782009-05-10 08:15:24 +00001402static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001403{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001404 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001405}
1406
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001407static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001408{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001409 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001410 char *name;
1411 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001412 } methods[] =
1413 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001414 {
1415 "strict",
1416 {
1417 "strict_errors",
1418 strict_errors,
1419 METH_O,
1420 PyDoc_STR("Implements the 'strict' error handling, which "
1421 "raises a UnicodeError on coding errors.")
1422 }
1423 },
1424 {
1425 "ignore",
1426 {
1427 "ignore_errors",
1428 ignore_errors,
1429 METH_O,
1430 PyDoc_STR("Implements the 'ignore' error handling, which "
1431 "ignores malformed data and continues.")
1432 }
1433 },
1434 {
1435 "replace",
1436 {
1437 "replace_errors",
1438 replace_errors,
1439 METH_O,
1440 PyDoc_STR("Implements the 'replace' error handling, which "
1441 "replaces malformed data with a replacement marker.")
1442 }
1443 },
1444 {
1445 "xmlcharrefreplace",
1446 {
1447 "xmlcharrefreplace_errors",
1448 xmlcharrefreplace_errors,
1449 METH_O,
1450 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1451 "which replaces an unencodable character with the "
1452 "appropriate XML character reference.")
1453 }
1454 },
1455 {
1456 "backslashreplace",
1457 {
1458 "backslashreplace_errors",
1459 backslashreplace_errors,
1460 METH_O,
1461 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001462 "which replaces malformed data with a backslashed "
1463 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001464 }
1465 },
1466 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001467 "namereplace",
1468 {
1469 "namereplace_errors",
1470 namereplace_errors,
1471 METH_O,
1472 PyDoc_STR("Implements the 'namereplace' error handling, "
1473 "which replaces an unencodable character with a "
1474 "\\N{...} escape sequence.")
1475 }
1476 },
1477 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001478 "surrogatepass",
1479 {
1480 "surrogatepass",
1481 surrogatepass_errors,
1482 METH_O
1483 }
1484 },
1485 {
1486 "surrogateescape",
1487 {
1488 "surrogateescape",
1489 surrogateescape_errors,
1490 METH_O
1491 }
1492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494
Victor Stinnercaba55b2018-08-03 15:33:52 +02001495 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001496 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001497 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001498
1499 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001500 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001501
1502 interp->codec_search_path = PyList_New(0);
1503 interp->codec_search_cache = PyDict_New();
1504 interp->codec_error_registry = PyDict_New();
1505
1506 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001507 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001508 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001509 int res;
1510 if (!func)
1511 Py_FatalError("can't initialize codec error registry");
1512 res = PyCodec_RegisterError(methods[i].name, func);
1513 Py_DECREF(func);
1514 if (res)
1515 Py_FatalError("can't initialize codec error registry");
1516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001517 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001518
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001519 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001520 interp->codec_search_cache == NULL ||
1521 interp->codec_error_registry == NULL)
1522 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001523
Christian Heimes819b8bf2008-01-03 23:05:47 +00001524 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001525 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001526 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001527 }
1528 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001529 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001530 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001531}