blob: eb3cd35fb8e249a2506e221bebdb2d7d68be8233 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Eric Snow2ebc5ce2017-09-07 23:51:28 -060012#include "internal/pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
Victor Stinnerf5cff562011-10-14 02:13:11 +020016const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
20/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000022
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
Guido van Rossumb95de4f2000-03-31 17:25:23 +000026 ImportErrors are silently ignored by this function. Only one try is
27 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
29*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000035 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
Guido van Rossum21431e82007-10-19 21:48:41 +000081 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Nick Coghlan8fad1672014-09-15 23:50:44 +1200188int _PyCodec_Forget(const char *encoding)
189{
190 PyInterpreterState *interp;
191 PyObject *v;
192 int result;
193
194 interp = PyThreadState_GET()->interp;
195 if (interp->codec_search_path == NULL) {
196 return -1;
197 }
198
199 /* Convert the encoding to a normalized Python string: all
200 characters are converted to lower case, spaces and hyphens are
201 replaced with underscores. */
202 v = normalizestring(encoding);
203 if (v == NULL) {
204 return -1;
205 }
206
207 /* Drop the named codec from the internal cache */
208 result = PyDict_DelItem(interp->codec_search_cache, v);
209 Py_DECREF(v);
210
211 return result;
212}
213
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000214/* Codec registry encoding check API. */
215
216int PyCodec_KnownEncoding(const char *encoding)
217{
218 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000220 codecs = _PyCodec_Lookup(encoding);
221 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 PyErr_Clear();
223 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000224 }
225 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 Py_DECREF(codecs);
227 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000228 }
229}
230
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000231static
232PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000234{
235 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000236
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000237 args = PyTuple_New(1 + (errors != NULL));
238 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000240 Py_INCREF(object);
241 PyTuple_SET_ITEM(args,0,object);
242 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 v = PyUnicode_FromString(errors);
246 if (v == NULL) {
247 Py_DECREF(args);
248 return NULL;
249 }
250 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000251 }
252 return args;
253}
254
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256
257static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000259{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 PyObject *codecs;
261 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000262
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000263 codecs = _PyCodec_Lookup(encoding);
264 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 v = PyTuple_GET_ITEM(codecs, index);
267 Py_DECREF(codecs);
268 Py_INCREF(v);
269 return v;
270}
271
Nick Coghlana9b15242014-02-04 22:11:18 +1000272/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000273static
Nick Coghlana9b15242014-02-04 22:11:18 +1000274PyObject *codec_makeincrementalcodec(PyObject *codec_info,
275 const char *errors,
276 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Nick Coghlana9b15242014-02-04 22:11:18 +1000278 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
Nick Coghlana9b15242014-02-04 22:11:18 +1000280 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100286 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 Py_DECREF(inccodec);
288 return ret;
289}
290
Nick Coghlana9b15242014-02-04 22:11:18 +1000291static
292PyObject *codec_getincrementalcodec(const char *encoding,
293 const char *errors,
294 const char *attrname)
295{
296 PyObject *codec_info, *ret;
297
298 codec_info = _PyCodec_Lookup(encoding);
299 if (codec_info == NULL)
300 return NULL;
301 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
302 Py_DECREF(codec_info);
303 return ret;
304}
305
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306/* Helper function to create a stream codec. */
307
308static
309PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 PyObject *stream,
311 const char *errors,
312 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000313{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000314 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315
316 codecs = _PyCodec_Lookup(encoding);
317 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000319
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000320 codeccls = PyTuple_GET_ITEM(codecs, index);
321 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000323 else
Victor Stinner7bfb42d2016-12-05 17:04:32 +0100324 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 Py_DECREF(codecs);
326 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000327}
328
Nick Coghlana9b15242014-02-04 22:11:18 +1000329/* Helpers to work with the result of _PyCodec_Lookup
330
331 */
332PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
333 const char *errors)
334{
335 return codec_makeincrementalcodec(codec_info, errors,
336 "incrementaldecoder");
337}
338
339PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
340 const char *errors)
341{
342 return codec_makeincrementalcodec(codec_info, errors,
343 "incrementalencoder");
344}
345
346
Guido van Rossum98297ee2007-11-06 21:34:58 +0000347/* Convenience APIs to query the Codec registry.
348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351 */
352
353PyObject *PyCodec_Encoder(const char *encoding)
354{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000355 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000356}
357
358PyObject *PyCodec_Decoder(const char *encoding)
359{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361}
362
Thomas Woutersa9773292006-04-21 09:43:23 +0000363PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000365{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000367}
368
369PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000370 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000371{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000373}
374
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000376 PyObject *stream,
377 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380}
381
382PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 PyObject *stream,
384 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000385{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000387}
388
Nick Coghlan8b097b42013-11-13 23:49:21 +1000389/* Helper that tries to ensure the reported exception chain indicates the
390 * codec that was invoked to trigger the failure without changing the type
391 * of the exception raised.
392 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* Hand the operation and codec name to _PyErr_TrySetFromCause(),
       which either swaps the active exception for an updated clone
       carrying this message or leaves the original untouched. */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
404
Martin Panter6245cb32016-04-15 02:14:19 +0000405/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000406 and return the resulting encoded object (usually a Python string).
407
408 errors is passed to the encoder factory as argument if non-NULL. */
409
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000410static PyObject *
411_PyCodec_EncodeInternal(PyObject *object,
412 PyObject *encoder,
413 const char *encoding,
414 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000416 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000417 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000418
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419 args = args_tuple(object, errors);
420 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000422
423 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000424 if (result == NULL) {
425 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000427 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000428
Guido van Rossum98297ee2007-11-06 21:34:58 +0000429 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000430 PyTuple_GET_SIZE(result) != 2) {
431 PyErr_SetString(PyExc_TypeError,
432 "encoder must return a tuple (object, integer)");
433 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000434 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000435 v = PyTuple_GET_ITEM(result,0);
436 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000437 /* We don't check or use the second (integer) entry. */
438
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000439 Py_DECREF(args);
440 Py_DECREF(encoder);
441 Py_DECREF(result);
442 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000443
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000444 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000445 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000446 Py_XDECREF(args);
447 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000448 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000449}
450
451/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000452 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000453
454 errors is passed to the decoder factory as argument if non-NULL. */
455
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000456static PyObject *
457_PyCodec_DecodeInternal(PyObject *object,
458 PyObject *decoder,
459 const char *encoding,
460 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000461{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000462 PyObject *args = NULL, *result = NULL;
463 PyObject *v;
464
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000465 args = args_tuple(object, errors);
466 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000468
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000469 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000470 if (result == NULL) {
471 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000474 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 PyTuple_GET_SIZE(result) != 2) {
476 PyErr_SetString(PyExc_TypeError,
477 "decoder must return a tuple (object,integer)");
478 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000479 }
480 v = PyTuple_GET_ITEM(result,0);
481 Py_INCREF(v);
482 /* We don't check or use the second (integer) entry. */
483
484 Py_DECREF(args);
485 Py_DECREF(decoder);
486 Py_DECREF(result);
487 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000488
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000489 onError:
490 Py_XDECREF(args);
491 Py_XDECREF(decoder);
492 Py_XDECREF(result);
493 return NULL;
494}
495
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000496/* Generic encoding/decoding API */
497PyObject *PyCodec_Encode(PyObject *object,
498 const char *encoding,
499 const char *errors)
500{
501 PyObject *encoder;
502
503 encoder = PyCodec_Encoder(encoding);
504 if (encoder == NULL)
505 return NULL;
506
507 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
508}
509
510PyObject *PyCodec_Decode(PyObject *object,
511 const char *encoding,
512 const char *errors)
513{
514 PyObject *decoder;
515
516 decoder = PyCodec_Decoder(encoding);
517 if (decoder == NULL)
518 return NULL;
519
520 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
521}
522
523/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000524PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
525 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000526{
527 _Py_IDENTIFIER(_is_text_encoding);
528 PyObject *codec;
529 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000530 int is_text_codec;
531
532 codec = _PyCodec_Lookup(encoding);
533 if (codec == NULL)
534 return NULL;
535
536 /* Backwards compatibility: assume any raw tuple describes a text
537 * encoding, and the same for anything lacking the private
538 * attribute.
539 */
540 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200541 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
542 Py_DECREF(codec);
543 return NULL;
544 }
545 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000546 is_text_codec = PyObject_IsTrue(attr);
547 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300548 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000549 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300550 if (!is_text_codec)
551 PyErr_Format(PyExc_LookupError,
552 "'%.400s' is not a text encoding; "
553 "use %s to handle arbitrary codecs",
554 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000555 return NULL;
556 }
557 }
558 }
559
Nick Coghlana9b15242014-02-04 22:11:18 +1000560 /* This appears to be a valid text encoding */
561 return codec;
562}
563
564
565static
566PyObject *codec_getitem_checked(const char *encoding,
567 const char *alternate_command,
568 int index)
569{
570 PyObject *codec;
571 PyObject *v;
572
573 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
574 if (codec == NULL)
575 return NULL;
576
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000577 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000578 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000579 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000580 return v;
581}
582
583static PyObject * _PyCodec_TextEncoder(const char *encoding)
584{
Nick Coghlana9b15242014-02-04 22:11:18 +1000585 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000586}
587
588static PyObject * _PyCodec_TextDecoder(const char *encoding)
589{
Nick Coghlana9b15242014-02-04 22:11:18 +1000590 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000591}
592
593PyObject *_PyCodec_EncodeText(PyObject *object,
594 const char *encoding,
595 const char *errors)
596{
597 PyObject *encoder;
598
599 encoder = _PyCodec_TextEncoder(encoding);
600 if (encoder == NULL)
601 return NULL;
602
603 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
604}
605
606PyObject *_PyCodec_DecodeText(PyObject *object,
607 const char *encoding,
608 const char *errors)
609{
610 PyObject *decoder;
611
612 decoder = _PyCodec_TextDecoder(encoding);
613 if (decoder == NULL)
614 return NULL;
615
616 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
617}
618
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
   callback name, when name is specified as the error parameter
   in the call to the encode/decode function.
   Return 0 on success, -1 on error */
625int PyCodec_RegisterError(const char *name, PyObject *error)
626{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000627 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000628 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000629 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000630 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000631 PyErr_SetString(PyExc_TypeError, "handler must be callable");
632 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000633 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000634 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300635 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000636}
637
638/* Lookup the error handling callback function registered under the
639 name error. As a special case NULL can be passed, in which case
640 the error handling callback for strict encoding will be returned. */
641PyObject *PyCodec_LookupError(const char *name)
642{
643 PyObject *handler = NULL;
644
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000645 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000646 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000647 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000648
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000649 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300651 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000654 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 return handler;
657}
658
659static void wrong_exception_type(PyObject *exc)
660{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300661 PyErr_Format(PyExc_TypeError,
662 "don't know how to handle %.200s in error callback",
663 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000664}
665
666PyObject *PyCodec_StrictErrors(PyObject *exc)
667{
Brett Cannonbf364092006-03-01 04:25:17 +0000668 if (PyExceptionInstance_Check(exc))
669 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000671 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000672 return NULL;
673}
674
675
676PyObject *PyCodec_IgnoreErrors(PyObject *exc)
677{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000678 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300679
680 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000681 if (PyUnicodeEncodeError_GetEnd(exc, &end))
682 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000683 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300684 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000685 if (PyUnicodeDecodeError_GetEnd(exc, &end))
686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000687 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300688 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689 if (PyUnicodeTranslateError_GetEnd(exc, &end))
690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000691 }
692 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000693 wrong_exception_type(exc);
694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695 }
Victor Stinneree450092011-12-01 02:52:11 +0100696 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697}
698
699
700PyObject *PyCodec_ReplaceErrors(PyObject *exc)
701{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200702 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000703
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300704 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200706 int kind;
707 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000708 if (PyUnicodeEncodeError_GetStart(exc, &start))
709 return NULL;
710 if (PyUnicodeEncodeError_GetEnd(exc, &end))
711 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200712 len = end - start;
713 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000714 if (res == NULL)
715 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716 kind = PyUnicode_KIND(res);
717 data = PyUnicode_DATA(res);
718 for (i = 0; i < len; ++i)
719 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200720 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000722 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300723 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000724 if (PyUnicodeDecodeError_GetEnd(exc, &end))
725 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200726 return Py_BuildValue("(Cn)",
727 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
728 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000729 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300730 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 int kind;
733 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000734 if (PyUnicodeTranslateError_GetStart(exc, &start))
735 return NULL;
736 if (PyUnicodeTranslateError_GetEnd(exc, &end))
737 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200738 len = end - start;
739 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000740 if (res == NULL)
741 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742 kind = PyUnicode_KIND(res);
743 data = PyUnicode_DATA(res);
744 for (i=0; i < len; i++)
745 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200746 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000748 }
749 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 wrong_exception_type(exc);
751 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000752 }
753}
754
755PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
756{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300757 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758 PyObject *restuple;
759 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100760 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000761 Py_ssize_t start;
762 Py_ssize_t end;
763 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100764 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300765 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100766 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000767 if (PyUnicodeEncodeError_GetStart(exc, &start))
768 return NULL;
769 if (PyUnicodeEncodeError_GetEnd(exc, &end))
770 return NULL;
771 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
772 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300773 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
774 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100775 for (i = start, ressize = 0; i < end; ++i) {
776 /* object is guaranteed to be "ready" */
777 ch = PyUnicode_READ_CHAR(object, i);
778 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000779 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100780 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000781 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 ressize += 2+6+1;
790 else
791 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 }
793 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100794 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 if (res == NULL) {
796 Py_DECREF(object);
797 return NULL;
798 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100799 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100801 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 int digits;
803 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100804 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 *outp++ = '&';
806 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100807 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000808 digits = 1;
809 base = 1;
810 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100811 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 digits = 2;
813 base = 10;
814 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100815 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 digits = 3;
817 base = 100;
818 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100819 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 digits = 4;
821 base = 1000;
822 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100823 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 digits = 5;
825 base = 10000;
826 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100827 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 digits = 6;
829 base = 100000;
830 }
831 else {
832 digits = 7;
833 base = 1000000;
834 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100836 *outp++ = '0' + ch/base;
837 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000838 base /= 10;
839 }
840 *outp++ = ';';
841 }
Victor Stinner8f825062012-04-27 13:55:39 +0200842 assert(_PyUnicode_CheckConsistency(res, 1));
843 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000844 Py_DECREF(object);
845 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000846 }
847 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 wrong_exception_type(exc);
849 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000850 }
851}
852
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000853PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
854{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200855 PyObject *object;
856 Py_ssize_t i;
857 Py_ssize_t start;
858 Py_ssize_t end;
859 PyObject *res;
860 unsigned char *outp;
861 int ressize;
862 Py_UCS4 c;
863
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300864 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300865 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200866 if (PyUnicodeDecodeError_GetStart(exc, &start))
867 return NULL;
868 if (PyUnicodeDecodeError_GetEnd(exc, &end))
869 return NULL;
870 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
871 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300872 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200873 res = PyUnicode_New(4 * (end - start), 127);
874 if (res == NULL) {
875 Py_DECREF(object);
876 return NULL;
877 }
878 outp = PyUnicode_1BYTE_DATA(res);
879 for (i = start; i < end; i++, outp += 4) {
880 unsigned char c = p[i];
881 outp[0] = '\\';
882 outp[1] = 'x';
883 outp[2] = Py_hexdigits[(c>>4)&0xf];
884 outp[3] = Py_hexdigits[c&0xf];
885 }
886
887 assert(_PyUnicode_CheckConsistency(res, 1));
888 Py_DECREF(object);
889 return Py_BuildValue("(Nn)", res, end);
890 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300891 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000892 if (PyUnicodeEncodeError_GetStart(exc, &start))
893 return NULL;
894 if (PyUnicodeEncodeError_GetEnd(exc, &end))
895 return NULL;
896 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
897 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200898 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300899 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200900 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000901 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200902 if (PyUnicodeTranslateError_GetEnd(exc, &end))
903 return NULL;
904 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
905 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000906 }
907 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908 wrong_exception_type(exc);
909 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000910 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200911
912 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
913 end = start + PY_SSIZE_T_MAX / (1+1+8);
914 for (i = start, ressize = 0; i < end; ++i) {
915 /* object is guaranteed to be "ready" */
916 c = PyUnicode_READ_CHAR(object, i);
917 if (c >= 0x10000) {
918 ressize += 1+1+8;
919 }
920 else if (c >= 0x100) {
921 ressize += 1+1+4;
922 }
923 else
924 ressize += 1+1+2;
925 }
926 res = PyUnicode_New(ressize, 127);
927 if (res == NULL) {
928 Py_DECREF(object);
929 return NULL;
930 }
931 outp = PyUnicode_1BYTE_DATA(res);
932 for (i = start; i < end; ++i) {
933 c = PyUnicode_READ_CHAR(object, i);
934 *outp++ = '\\';
935 if (c >= 0x00010000) {
936 *outp++ = 'U';
937 *outp++ = Py_hexdigits[(c>>28)&0xf];
938 *outp++ = Py_hexdigits[(c>>24)&0xf];
939 *outp++ = Py_hexdigits[(c>>20)&0xf];
940 *outp++ = Py_hexdigits[(c>>16)&0xf];
941 *outp++ = Py_hexdigits[(c>>12)&0xf];
942 *outp++ = Py_hexdigits[(c>>8)&0xf];
943 }
944 else if (c >= 0x100) {
945 *outp++ = 'u';
946 *outp++ = Py_hexdigits[(c>>12)&0xf];
947 *outp++ = Py_hexdigits[(c>>8)&0xf];
948 }
949 else
950 *outp++ = 'x';
951 *outp++ = Py_hexdigits[(c>>4)&0xf];
952 *outp++ = Py_hexdigits[c&0xf];
953 }
954
955 assert(_PyUnicode_CheckConsistency(res, 1));
956 Py_DECREF(object);
957 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000958}
959
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200960static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200961
962PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
963{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300964 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200965 PyObject *restuple;
966 PyObject *object;
967 Py_ssize_t i;
968 Py_ssize_t start;
969 Py_ssize_t end;
970 PyObject *res;
971 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200972 Py_ssize_t ressize;
973 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200974 Py_UCS4 c;
975 char buffer[256]; /* NAME_MAXLEN */
976 if (PyUnicodeEncodeError_GetStart(exc, &start))
977 return NULL;
978 if (PyUnicodeEncodeError_GetEnd(exc, &end))
979 return NULL;
980 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
981 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200982 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200983 /* load the unicode data module */
984 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
985 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200986 if (!ucnhash_CAPI)
987 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200988 }
989 for (i = start, ressize = 0; i < end; ++i) {
990 /* object is guaranteed to be "ready" */
991 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200992 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200993 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200994 }
995 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200996 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200997 }
998 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200999 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001000 }
1001 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001002 replsize = 1+1+2;
1003 if (ressize > PY_SSIZE_T_MAX - replsize)
1004 break;
1005 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001006 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001007 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001008 res = PyUnicode_New(ressize, 127);
1009 if (res==NULL)
1010 return NULL;
1011 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1012 i < end; ++i) {
1013 c = PyUnicode_READ_CHAR(object, i);
1014 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001015 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001016 *outp++ = 'N';
1017 *outp++ = '{';
1018 strcpy((char *)outp, buffer);
1019 outp += strlen(buffer);
1020 *outp++ = '}';
1021 continue;
1022 }
1023 if (c >= 0x00010000) {
1024 *outp++ = 'U';
1025 *outp++ = Py_hexdigits[(c>>28)&0xf];
1026 *outp++ = Py_hexdigits[(c>>24)&0xf];
1027 *outp++ = Py_hexdigits[(c>>20)&0xf];
1028 *outp++ = Py_hexdigits[(c>>16)&0xf];
1029 *outp++ = Py_hexdigits[(c>>12)&0xf];
1030 *outp++ = Py_hexdigits[(c>>8)&0xf];
1031 }
1032 else if (c >= 0x100) {
1033 *outp++ = 'u';
1034 *outp++ = Py_hexdigits[(c>>12)&0xf];
1035 *outp++ = Py_hexdigits[(c>>8)&0xf];
1036 }
1037 else
1038 *outp++ = 'x';
1039 *outp++ = Py_hexdigits[(c>>4)&0xf];
1040 *outp++ = Py_hexdigits[c&0xf];
1041 }
1042
Benjamin Peterson3663b582014-11-26 14:39:54 -06001043 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001044 assert(_PyUnicode_CheckConsistency(res, 1));
1045 restuple = Py_BuildValue("(Nn)", res, end);
1046 Py_DECREF(object);
1047 return restuple;
1048 }
1049 else {
1050 wrong_exception_type(exc);
1051 return NULL;
1052 }
1053}
1054
/* Result codes of get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify a byte-order suffix.  Returns 'b' for "be", 'l' for "le"
   (compared case-insensitively, and only when nothing follows), or 0 for
   anything else.  The first character is tested before the second so that
   an empty suffix is never read past its terminating NUL. */
static int
standard_encoding_byteorder(const char *suffix)
{
    int first = Py_TOLOWER(suffix[0]);

    if ((first == 'b' || first == 'l') &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0')
        return first;
    return 0;
}

/* Map an encoding name onto one of the ENC_* codes.

   Recognised spellings are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; the 16/32 forms may be followed by an optional '-' or
   '_' and "be"/"le" (any case).  Without a byte-order suffix the native
   byte order is chosen via WORDS_BIGENDIAN.  The exact name "CP_UTF8" is
   also accepted as UTF-8.

   *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for UTF-32.  It is
   also written when a "utf16"/"utf32" prefix is followed by an unrecognised
   suffix, in which case ENC_UNKNOWN is returned. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            int order;
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            order = standard_encoding_byteorder(encoding);
            if (order == 'b')
                return ENC_UTF16BE;
            if (order == 'l')
                return ENC_UTF16LE;
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            int order;
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            order = standard_encoding_byteorder(encoding);
            if (order == 'b')
                return ENC_UTF32BE;
            if (order == 'l')
                return ENC_UTF32LE;
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1120
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001121/* This handler is declared static until someone demonstrates
1122 a need to call it directly. */
1123static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001124PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001125{
1126 PyObject *restuple;
1127 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001128 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001129 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001130 int code;
1131 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001132 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001133 Py_ssize_t start;
1134 Py_ssize_t end;
1135 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001136
1137 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001138 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001139 if (PyUnicodeEncodeError_GetStart(exc, &start))
1140 return NULL;
1141 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1142 return NULL;
1143 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1144 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001145 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1146 Py_DECREF(object);
1147 return NULL;
1148 }
1149 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1150 Py_DECREF(object);
1151 Py_DECREF(encode);
1152 return NULL;
1153 }
1154 code = get_standard_encoding(encoding, &bytelength);
1155 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001156 if (code == ENC_UNKNOWN) {
1157 /* Not supported, fail with original exception */
1158 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1159 Py_DECREF(object);
1160 return NULL;
1161 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001162
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001163 if (end - start > PY_SSIZE_T_MAX / bytelength)
1164 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001165 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001166 if (!res) {
1167 Py_DECREF(object);
1168 return NULL;
1169 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001170 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001171 for (i = start; i < end; i++) {
1172 /* object is guaranteed to be "ready" */
1173 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001174 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001175 /* Not a surrogate, fail with original exception */
1176 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1177 Py_DECREF(res);
1178 Py_DECREF(object);
1179 return NULL;
1180 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001181 switch (code) {
1182 case ENC_UTF8:
1183 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1184 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1185 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1186 break;
1187 case ENC_UTF16LE:
1188 *outp++ = (unsigned char) ch;
1189 *outp++ = (unsigned char)(ch >> 8);
1190 break;
1191 case ENC_UTF16BE:
1192 *outp++ = (unsigned char)(ch >> 8);
1193 *outp++ = (unsigned char) ch;
1194 break;
1195 case ENC_UTF32LE:
1196 *outp++ = (unsigned char) ch;
1197 *outp++ = (unsigned char)(ch >> 8);
1198 *outp++ = (unsigned char)(ch >> 16);
1199 *outp++ = (unsigned char)(ch >> 24);
1200 break;
1201 case ENC_UTF32BE:
1202 *outp++ = (unsigned char)(ch >> 24);
1203 *outp++ = (unsigned char)(ch >> 16);
1204 *outp++ = (unsigned char)(ch >> 8);
1205 *outp++ = (unsigned char) ch;
1206 break;
1207 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001208 }
1209 restuple = Py_BuildValue("(On)", res, end);
1210 Py_DECREF(res);
1211 Py_DECREF(object);
1212 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001213 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001214 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001215 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001216 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 if (PyUnicodeDecodeError_GetStart(exc, &start))
1218 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001219 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1220 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001221 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1222 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001223 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001224 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1225 Py_DECREF(object);
1226 return NULL;
1227 }
1228 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1229 Py_DECREF(object);
1230 Py_DECREF(encode);
1231 return NULL;
1232 }
1233 code = get_standard_encoding(encoding, &bytelength);
1234 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001235 if (code == ENC_UNKNOWN) {
1236 /* Not supported, fail with original exception */
1237 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1238 Py_DECREF(object);
1239 return NULL;
1240 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001241
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001242 /* Try decoding a single surrogate character. If
1243 there are more, let the codec call us again. */
1244 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001245 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1246 switch (code) {
1247 case ENC_UTF8:
1248 if ((p[0] & 0xf0) == 0xe0 &&
1249 (p[1] & 0xc0) == 0x80 &&
1250 (p[2] & 0xc0) == 0x80) {
1251 /* it's a three-byte code */
1252 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1253 }
1254 break;
1255 case ENC_UTF16LE:
1256 ch = p[1] << 8 | p[0];
1257 break;
1258 case ENC_UTF16BE:
1259 ch = p[0] << 8 | p[1];
1260 break;
1261 case ENC_UTF32LE:
1262 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1263 break;
1264 case ENC_UTF32BE:
1265 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1266 break;
1267 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001269
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001271 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1272 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1274 return NULL;
1275 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001276 res = PyUnicode_FromOrdinal(ch);
1277 if (res == NULL)
1278 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001279 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001280 }
1281 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 wrong_exception_type(exc);
1283 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001284 }
1285}
1286
Martin v. Löwis011e8422009-05-05 04:43:17 +00001287static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001288PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001289{
1290 PyObject *restuple;
1291 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001292 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001293 Py_ssize_t start;
1294 Py_ssize_t end;
1295 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001296
1297 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001298 char *outp;
1299 if (PyUnicodeEncodeError_GetStart(exc, &start))
1300 return NULL;
1301 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1302 return NULL;
1303 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1304 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001305 res = PyBytes_FromStringAndSize(NULL, end-start);
1306 if (!res) {
1307 Py_DECREF(object);
1308 return NULL;
1309 }
1310 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001311 for (i = start; i < end; i++) {
1312 /* object is guaranteed to be "ready" */
1313 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001314 if (ch < 0xdc80 || ch > 0xdcff) {
1315 /* Not a UTF-8b surrogate, fail with original exception */
1316 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1317 Py_DECREF(res);
1318 Py_DECREF(object);
1319 return NULL;
1320 }
1321 *outp++ = ch - 0xdc00;
1322 }
1323 restuple = Py_BuildValue("(On)", res, end);
1324 Py_DECREF(res);
1325 Py_DECREF(object);
1326 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001327 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001328 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001329 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001330 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001331 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001332 int consumed = 0;
1333 if (PyUnicodeDecodeError_GetStart(exc, &start))
1334 return NULL;
1335 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1336 return NULL;
1337 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1338 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001339 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001340 while (consumed < 4 && consumed < end-start) {
1341 /* Refuse to escape ASCII bytes. */
1342 if (p[start+consumed] < 128)
1343 break;
1344 ch[consumed] = 0xdc00 + p[start+consumed];
1345 consumed++;
1346 }
1347 Py_DECREF(object);
1348 if (!consumed) {
1349 /* codec complained about ASCII byte. */
1350 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1351 return NULL;
1352 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001353 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1354 if (str == NULL)
1355 return NULL;
1356 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001357 }
1358 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001359 wrong_exception_type(exc);
1360 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001361 }
1362}
1363
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001364
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001365static PyObject *strict_errors(PyObject *self, PyObject *exc)
1366{
1367 return PyCodec_StrictErrors(exc);
1368}
1369
1370
1371static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1372{
1373 return PyCodec_IgnoreErrors(exc);
1374}
1375
1376
1377static PyObject *replace_errors(PyObject *self, PyObject *exc)
1378{
1379 return PyCodec_ReplaceErrors(exc);
1380}
1381
1382
1383static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1384{
1385 return PyCodec_XMLCharRefReplaceErrors(exc);
1386}
1387
1388
1389static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1390{
1391 return PyCodec_BackslashReplaceErrors(exc);
1392}
1393
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001394static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1395{
1396 return PyCodec_NameReplaceErrors(exc);
1397}
1398
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001399static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001400{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001401 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001402}
1403
Martin v. Löwis43c57782009-05-10 08:15:24 +00001404static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001405{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001406 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001407}
1408
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001409static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001410{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001411 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001412 char *name;
1413 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 } methods[] =
1415 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001416 {
1417 "strict",
1418 {
1419 "strict_errors",
1420 strict_errors,
1421 METH_O,
1422 PyDoc_STR("Implements the 'strict' error handling, which "
1423 "raises a UnicodeError on coding errors.")
1424 }
1425 },
1426 {
1427 "ignore",
1428 {
1429 "ignore_errors",
1430 ignore_errors,
1431 METH_O,
1432 PyDoc_STR("Implements the 'ignore' error handling, which "
1433 "ignores malformed data and continues.")
1434 }
1435 },
1436 {
1437 "replace",
1438 {
1439 "replace_errors",
1440 replace_errors,
1441 METH_O,
1442 PyDoc_STR("Implements the 'replace' error handling, which "
1443 "replaces malformed data with a replacement marker.")
1444 }
1445 },
1446 {
1447 "xmlcharrefreplace",
1448 {
1449 "xmlcharrefreplace_errors",
1450 xmlcharrefreplace_errors,
1451 METH_O,
1452 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1453 "which replaces an unencodable character with the "
1454 "appropriate XML character reference.")
1455 }
1456 },
1457 {
1458 "backslashreplace",
1459 {
1460 "backslashreplace_errors",
1461 backslashreplace_errors,
1462 METH_O,
1463 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001464 "which replaces malformed data with a backslashed "
1465 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001466 }
1467 },
1468 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001469 "namereplace",
1470 {
1471 "namereplace_errors",
1472 namereplace_errors,
1473 METH_O,
1474 PyDoc_STR("Implements the 'namereplace' error handling, "
1475 "which replaces an unencodable character with a "
1476 "\\N{...} escape sequence.")
1477 }
1478 },
1479 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001480 "surrogatepass",
1481 {
1482 "surrogatepass",
1483 surrogatepass_errors,
1484 METH_O
1485 }
1486 },
1487 {
1488 "surrogateescape",
1489 {
1490 "surrogateescape",
1491 surrogateescape_errors,
1492 METH_O
1493 }
1494 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001495 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001496
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001497 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001498 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001499 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001500
1501 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001502 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001503
1504 interp->codec_search_path = PyList_New(0);
1505 interp->codec_search_cache = PyDict_New();
1506 interp->codec_error_registry = PyDict_New();
1507
1508 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001509 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001510 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001511 int res;
1512 if (!func)
1513 Py_FatalError("can't initialize codec error registry");
1514 res = PyCodec_RegisterError(methods[i].name, func);
1515 Py_DECREF(func);
1516 if (res)
1517 Py_FatalError("can't initialize codec error registry");
1518 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001520
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001521 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001522 interp->codec_search_cache == NULL ||
1523 interp->codec_error_registry == NULL)
1524 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001525
Christian Heimes819b8bf2008-01-03 23:05:47 +00001526 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001527 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001528 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001529 }
1530 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001531 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001532 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001533}