blob: 688a40bd6ff418b6ec1010bd15a99805a16b5022 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020012#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000013#include <ctype.h>
14
/* Lower-case hexadecimal digits, indexed by nibble value (0..15). */
const char *Py_hexdigits = "0123456789abcdef";
16
Guido van Rossumfeee4b92000-03-10 22:57:27 +000017/* --- Codec Registry ----------------------------------------------------- */
18
/* Import the standard encodings package which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   degrade the startup time of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
29
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000030static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032int PyCodec_Register(PyObject *search_function)
33{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000034 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000035 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000038 PyErr_BadArgument();
39 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000040 }
41 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000042 PyErr_SetString(PyExc_TypeError, "argument must be callable");
43 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000044 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000045 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000046
47 onError:
48 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000049}
50
Guido van Rossum9e896b32000-04-05 20:11:21 +000051/* Convert a string to a normalized Python string: all characters are
52 converted to lower case, spaces are replaced with underscores. */
53
Guido van Rossumfeee4b92000-03-10 22:57:27 +000054static
Guido van Rossum9e896b32000-04-05 20:11:21 +000055PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020057 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000058 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000059 char *p;
60 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000061
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000062 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 PyErr_SetString(PyExc_OverflowError, "string is too large");
64 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065 }
Guido van Rossum21431e82007-10-19 21:48:41 +000066
67 p = PyMem_Malloc(len + 1);
68 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020069 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000070 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020071 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000072 if (ch == ' ')
73 ch = '-';
74 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020075 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000076 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000077 }
Guido van Rossum21431e82007-10-19 21:48:41 +000078 p[i] = '\0';
79 v = PyUnicode_FromString(p);
80 if (v == NULL)
81 return NULL;
82 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000083 return v;
84}
85
86/* Lookup the given encoding and return a tuple providing the codec
87 facilities.
88
89 The encoding string is looked up converted to all lower-case
90 characters. This makes encodings looked up through this mechanism
91 effectively case-insensitive.
92
Guido van Rossum98297ee2007-11-06 21:34:58 +000093 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000094
95 As side effect, this tries to load the encodings package, if not
96 yet done. This is part of the lazy load strategy for the encodings
97 package.
98
99*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000100
101PyObject *_PyCodec_Lookup(const char *encoding)
102{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000103 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000104 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000105 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000106
Fred Drake766de832000-05-09 19:55:59 +0000107 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000108 PyErr_BadArgument();
109 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000110 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000111
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000112 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000113 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000114 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000115
Guido van Rossum9e896b32000-04-05 20:11:21 +0000116 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000117 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000118 replaced with underscores. */
119 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000122 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123
124 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000125 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000127 Py_INCREF(result);
128 Py_DECREF(v);
129 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000131
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000133 args = PyTuple_New(1);
134 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000136 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000137
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000138 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000139 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000140 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000141 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000142 PyErr_SetString(PyExc_LookupError,
143 "no codec search functions registered: "
144 "can't find encoding");
145 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000146 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000147
148 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000149 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000150
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 func = PyList_GetItem(interp->codec_search_path, i);
152 if (func == NULL)
153 goto onError;
154 result = PyEval_CallObject(func, args);
155 if (result == NULL)
156 goto onError;
157 if (result == Py_None) {
158 Py_DECREF(result);
159 continue;
160 }
161 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162 PyErr_SetString(PyExc_TypeError,
163 "codec search functions must return 4-tuples");
164 Py_DECREF(result);
165 goto onError;
166 }
167 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000168 }
169 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 /* XXX Perhaps we should cache misses too ? */
171 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000172 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000174 }
175
176 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000177 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000178 Py_DECREF(result);
179 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000180 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000181 Py_DECREF(args);
182 return result;
183
184 onError:
185 Py_XDECREF(args);
186 return NULL;
187}
188
Nick Coghlan8fad1672014-09-15 23:50:44 +1200189int _PyCodec_Forget(const char *encoding)
190{
191 PyInterpreterState *interp;
192 PyObject *v;
193 int result;
194
195 interp = PyThreadState_GET()->interp;
196 if (interp->codec_search_path == NULL) {
197 return -1;
198 }
199
200 /* Convert the encoding to a normalized Python string: all
201 characters are converted to lower case, spaces and hyphens are
202 replaced with underscores. */
203 v = normalizestring(encoding);
204 if (v == NULL) {
205 return -1;
206 }
207
208 /* Drop the named codec from the internal cache */
209 result = PyDict_DelItem(interp->codec_search_cache, v);
210 Py_DECREF(v);
211
212 return result;
213}
214
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000215/* Codec registry encoding check API. */
216
217int PyCodec_KnownEncoding(const char *encoding)
218{
219 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000220
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000221 codecs = _PyCodec_Lookup(encoding);
222 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 PyErr_Clear();
224 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000225 }
226 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_DECREF(codecs);
228 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000229 }
230}
231
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000232static
233PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000234 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000235{
236 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000237
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000238 args = PyTuple_New(1 + (errors != NULL));
239 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000241 Py_INCREF(object);
242 PyTuple_SET_ITEM(args,0,object);
243 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000244 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000245
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000246 v = PyUnicode_FromString(errors);
247 if (v == NULL) {
248 Py_DECREF(args);
249 return NULL;
250 }
251 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000252 }
253 return args;
254}
255
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000257
258static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000260{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 PyObject *codecs;
262 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000263
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 codecs = _PyCodec_Lookup(encoding);
265 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000267 v = PyTuple_GET_ITEM(codecs, index);
268 Py_DECREF(codecs);
269 Py_INCREF(v);
270 return v;
271}
272
Nick Coghlana9b15242014-02-04 22:11:18 +1000273/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274static
Nick Coghlana9b15242014-02-04 22:11:18 +1000275PyObject *codec_makeincrementalcodec(PyObject *codec_info,
276 const char *errors,
277 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278{
Nick Coghlana9b15242014-02-04 22:11:18 +1000279 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280
Nick Coghlana9b15242014-02-04 22:11:18 +1000281 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000283 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000284 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100287 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 Py_DECREF(inccodec);
289 return ret;
290}
291
Nick Coghlana9b15242014-02-04 22:11:18 +1000292static
293PyObject *codec_getincrementalcodec(const char *encoding,
294 const char *errors,
295 const char *attrname)
296{
297 PyObject *codec_info, *ret;
298
299 codec_info = _PyCodec_Lookup(encoding);
300 if (codec_info == NULL)
301 return NULL;
302 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303 Py_DECREF(codec_info);
304 return ret;
305}
306
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000307/* Helper function to create a stream codec. */
308
309static
310PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000311 PyObject *stream,
312 const char *errors,
313 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000314{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000315 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000316
317 codecs = _PyCodec_Lookup(encoding);
318 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000319 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000320
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000321 codeccls = PyTuple_GET_ITEM(codecs, index);
322 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000324 else
Victor Stinner7bfb42d2016-12-05 17:04:32 +0100325 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000326 Py_DECREF(codecs);
327 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000328}
329
Nick Coghlana9b15242014-02-04 22:11:18 +1000330/* Helpers to work with the result of _PyCodec_Lookup
331
332 */
333PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
334 const char *errors)
335{
336 return codec_makeincrementalcodec(codec_info, errors,
337 "incrementaldecoder");
338}
339
340PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
341 const char *errors)
342{
343 return codec_makeincrementalcodec(codec_info, errors,
344 "incrementalencoder");
345}
346
347
Guido van Rossum98297ee2007-11-06 21:34:58 +0000348/* Convenience APIs to query the Codec registry.
349
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000350 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000351
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352 */
353
354PyObject *PyCodec_Encoder(const char *encoding)
355{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000357}
358
359PyObject *PyCodec_Decoder(const char *encoding)
360{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000362}
363
Thomas Woutersa9773292006-04-21 09:43:23 +0000364PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000366{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000368}
369
370PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000371 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000372{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000373 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000374}
375
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 PyObject *stream,
378 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000379{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000380 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000381}
382
383PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000384 PyObject *stream,
385 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000386{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000388}
389
Nick Coghlan8b097b42013-11-13 23:49:21 +1000390/* Helper that tries to ensure the reported exception chain indicates the
391 * codec that was invoked to trigger the failure without changing the type
392 * of the exception raised.
393 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* Delegate to _PyErr_TrySetFromCause, which swaps the active
     * exception for a suitably updated clone carrying the message
     * below when it can, and otherwise leaves the original exception
     * untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
405
Martin Panter6245cb32016-04-15 02:14:19 +0000406/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000407 and return the resulting encoded object (usually a Python string).
408
409 errors is passed to the encoder factory as argument if non-NULL. */
410
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000411static PyObject *
412_PyCodec_EncodeInternal(PyObject *object,
413 PyObject *encoder,
414 const char *encoding,
415 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000416{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000417 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000418 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000420 args = args_tuple(object, errors);
421 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000423
424 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000425 if (result == NULL) {
426 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000428 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000429
Guido van Rossum98297ee2007-11-06 21:34:58 +0000430 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000431 PyTuple_GET_SIZE(result) != 2) {
432 PyErr_SetString(PyExc_TypeError,
433 "encoder must return a tuple (object, integer)");
434 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000435 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000436 v = PyTuple_GET_ITEM(result,0);
437 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000438 /* We don't check or use the second (integer) entry. */
439
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000440 Py_DECREF(args);
441 Py_DECREF(encoder);
442 Py_DECREF(result);
443 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000444
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000445 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000446 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000447 Py_XDECREF(args);
448 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000449 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000450}
451
452/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000453 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000454
455 errors is passed to the decoder factory as argument if non-NULL. */
456
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000457static PyObject *
458_PyCodec_DecodeInternal(PyObject *object,
459 PyObject *decoder,
460 const char *encoding,
461 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000462{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000463 PyObject *args = NULL, *result = NULL;
464 PyObject *v;
465
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000466 args = args_tuple(object, errors);
467 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000469
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000470 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000471 if (result == NULL) {
472 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000474 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000475 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476 PyTuple_GET_SIZE(result) != 2) {
477 PyErr_SetString(PyExc_TypeError,
478 "decoder must return a tuple (object,integer)");
479 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000480 }
481 v = PyTuple_GET_ITEM(result,0);
482 Py_INCREF(v);
483 /* We don't check or use the second (integer) entry. */
484
485 Py_DECREF(args);
486 Py_DECREF(decoder);
487 Py_DECREF(result);
488 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000489
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000490 onError:
491 Py_XDECREF(args);
492 Py_XDECREF(decoder);
493 Py_XDECREF(result);
494 return NULL;
495}
496
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000497/* Generic encoding/decoding API */
498PyObject *PyCodec_Encode(PyObject *object,
499 const char *encoding,
500 const char *errors)
501{
502 PyObject *encoder;
503
504 encoder = PyCodec_Encoder(encoding);
505 if (encoder == NULL)
506 return NULL;
507
508 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
509}
510
511PyObject *PyCodec_Decode(PyObject *object,
512 const char *encoding,
513 const char *errors)
514{
515 PyObject *decoder;
516
517 decoder = PyCodec_Decoder(encoding);
518 if (decoder == NULL)
519 return NULL;
520
521 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
522}
523
524/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000525PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000527{
528 _Py_IDENTIFIER(_is_text_encoding);
529 PyObject *codec;
530 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000531 int is_text_codec;
532
533 codec = _PyCodec_Lookup(encoding);
534 if (codec == NULL)
535 return NULL;
536
537 /* Backwards compatibility: assume any raw tuple describes a text
538 * encoding, and the same for anything lacking the private
539 * attribute.
540 */
541 if (!PyTuple_CheckExact(codec)) {
542 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543 if (attr == NULL) {
544 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545 PyErr_Clear();
546 } else {
547 Py_DECREF(codec);
548 return NULL;
549 }
550 } else {
551 is_text_codec = PyObject_IsTrue(attr);
552 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300553 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000554 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300555 if (!is_text_codec)
556 PyErr_Format(PyExc_LookupError,
557 "'%.400s' is not a text encoding; "
558 "use %s to handle arbitrary codecs",
559 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000560 return NULL;
561 }
562 }
563 }
564
Nick Coghlana9b15242014-02-04 22:11:18 +1000565 /* This appears to be a valid text encoding */
566 return codec;
567}
568
569
570static
571PyObject *codec_getitem_checked(const char *encoding,
572 const char *alternate_command,
573 int index)
574{
575 PyObject *codec;
576 PyObject *v;
577
578 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
579 if (codec == NULL)
580 return NULL;
581
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000583 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000584 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000585 return v;
586}
587
588static PyObject * _PyCodec_TextEncoder(const char *encoding)
589{
Nick Coghlana9b15242014-02-04 22:11:18 +1000590 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000591}
592
593static PyObject * _PyCodec_TextDecoder(const char *encoding)
594{
Nick Coghlana9b15242014-02-04 22:11:18 +1000595 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000596}
597
598PyObject *_PyCodec_EncodeText(PyObject *object,
599 const char *encoding,
600 const char *errors)
601{
602 PyObject *encoder;
603
604 encoder = _PyCodec_TextEncoder(encoding);
605 if (encoder == NULL)
606 return NULL;
607
608 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
609}
610
611PyObject *_PyCodec_DecodeText(PyObject *object,
612 const char *encoding,
613 const char *errors)
614{
615 PyObject *decoder;
616
617 decoder = _PyCodec_TextDecoder(encoding);
618 if (decoder == NULL)
619 return NULL;
620
621 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
622}
623
/* Register the error handling callback function error under the name
   name. This callback will be called by a codec when it encounters
   unencodable characters/undecodable bytes and name is specified as
   the error parameter in the call to the encode/decode function.
   Return 0 on success, -1 on error */
630int PyCodec_RegisterError(const char *name, PyObject *error)
631{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000632 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000633 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000634 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000635 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000636 PyErr_SetString(PyExc_TypeError, "handler must be callable");
637 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000639 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300640 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000641}
642
643/* Lookup the error handling callback function registered under the
644 name error. As a special case NULL can be passed, in which case
645 the error handling callback for strict encoding will be returned. */
646PyObject *PyCodec_LookupError(const char *name)
647{
648 PyObject *handler = NULL;
649
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000650 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000651 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000652 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000653
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000654 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000655 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300656 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000657 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000659 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000660 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000661 return handler;
662}
663
664static void wrong_exception_type(PyObject *exc)
665{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300666 PyErr_Format(PyExc_TypeError,
667 "don't know how to handle %.200s in error callback",
668 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000669}
670
671PyObject *PyCodec_StrictErrors(PyObject *exc)
672{
Brett Cannonbf364092006-03-01 04:25:17 +0000673 if (PyExceptionInstance_Check(exc))
674 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000675 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000676 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000677 return NULL;
678}
679
680
681PyObject *PyCodec_IgnoreErrors(PyObject *exc)
682{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000683 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300684
685 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000686 if (PyUnicodeEncodeError_GetEnd(exc, &end))
687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300689 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 if (PyUnicodeDecodeError_GetEnd(exc, &end))
691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000692 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300693 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000694 if (PyUnicodeTranslateError_GetEnd(exc, &end))
695 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000696 }
697 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000698 wrong_exception_type(exc);
699 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000700 }
Victor Stinneree450092011-12-01 02:52:11 +0100701 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000702}
703
704
705PyObject *PyCodec_ReplaceErrors(PyObject *exc)
706{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200707 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000708
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300709 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000710 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711 int kind;
712 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000713 if (PyUnicodeEncodeError_GetStart(exc, &start))
714 return NULL;
715 if (PyUnicodeEncodeError_GetEnd(exc, &end))
716 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 len = end - start;
718 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (res == NULL)
720 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200721 kind = PyUnicode_KIND(res);
722 data = PyUnicode_DATA(res);
723 for (i = 0; i < len; ++i)
724 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200725 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000727 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300728 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000729 if (PyUnicodeDecodeError_GetEnd(exc, &end))
730 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200731 return Py_BuildValue("(Cn)",
732 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
733 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000734 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300735 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000736 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200737 int kind;
738 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000739 if (PyUnicodeTranslateError_GetStart(exc, &start))
740 return NULL;
741 if (PyUnicodeTranslateError_GetEnd(exc, &end))
742 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200743 len = end - start;
744 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 if (res == NULL)
746 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200747 kind = PyUnicode_KIND(res);
748 data = PyUnicode_DATA(res);
749 for (i=0; i < len; i++)
750 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200751 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200752 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000753 }
754 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 wrong_exception_type(exc);
756 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757 }
758}
759
760PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
761{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300762 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000763 PyObject *restuple;
764 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100765 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 Py_ssize_t start;
767 Py_ssize_t end;
768 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100769 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300770 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100771 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 if (PyUnicodeEncodeError_GetStart(exc, &start))
773 return NULL;
774 if (PyUnicodeEncodeError_GetEnd(exc, &end))
775 return NULL;
776 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
777 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300778 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
779 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100780 for (i = start, ressize = 0; i < end; ++i) {
781 /* object is guaranteed to be "ready" */
782 ch = PyUnicode_READ_CHAR(object, i);
783 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100785 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100787 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100789 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100791 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100793 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 ressize += 2+6+1;
795 else
796 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 }
798 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100799 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 if (res == NULL) {
801 Py_DECREF(object);
802 return NULL;
803 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100804 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100806 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 int digits;
808 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100809 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000810 *outp++ = '&';
811 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100812 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 digits = 1;
814 base = 1;
815 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100816 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 digits = 2;
818 base = 10;
819 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100820 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 digits = 3;
822 base = 100;
823 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100824 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000825 digits = 4;
826 base = 1000;
827 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100828 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000829 digits = 5;
830 base = 10000;
831 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100832 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000833 digits = 6;
834 base = 100000;
835 }
836 else {
837 digits = 7;
838 base = 1000000;
839 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000840 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100841 *outp++ = '0' + ch/base;
842 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000843 base /= 10;
844 }
845 *outp++ = ';';
846 }
Victor Stinner8f825062012-04-27 13:55:39 +0200847 assert(_PyUnicode_CheckConsistency(res, 1));
848 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 Py_DECREF(object);
850 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000851 }
852 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000853 wrong_exception_type(exc);
854 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000855 }
856}
857
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
859{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200860 PyObject *object;
861 Py_ssize_t i;
862 Py_ssize_t start;
863 Py_ssize_t end;
864 PyObject *res;
865 unsigned char *outp;
866 int ressize;
867 Py_UCS4 c;
868
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300869 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300870 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200871 if (PyUnicodeDecodeError_GetStart(exc, &start))
872 return NULL;
873 if (PyUnicodeDecodeError_GetEnd(exc, &end))
874 return NULL;
875 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
876 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300877 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200878 res = PyUnicode_New(4 * (end - start), 127);
879 if (res == NULL) {
880 Py_DECREF(object);
881 return NULL;
882 }
883 outp = PyUnicode_1BYTE_DATA(res);
884 for (i = start; i < end; i++, outp += 4) {
885 unsigned char c = p[i];
886 outp[0] = '\\';
887 outp[1] = 'x';
888 outp[2] = Py_hexdigits[(c>>4)&0xf];
889 outp[3] = Py_hexdigits[c&0xf];
890 }
891
892 assert(_PyUnicode_CheckConsistency(res, 1));
893 Py_DECREF(object);
894 return Py_BuildValue("(Nn)", res, end);
895 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300896 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 if (PyUnicodeEncodeError_GetStart(exc, &start))
898 return NULL;
899 if (PyUnicodeEncodeError_GetEnd(exc, &end))
900 return NULL;
901 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
902 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200903 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300904 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200905 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200907 if (PyUnicodeTranslateError_GetEnd(exc, &end))
908 return NULL;
909 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
910 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000911 }
912 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000913 wrong_exception_type(exc);
914 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000915 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200916
917 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
918 end = start + PY_SSIZE_T_MAX / (1+1+8);
919 for (i = start, ressize = 0; i < end; ++i) {
920 /* object is guaranteed to be "ready" */
921 c = PyUnicode_READ_CHAR(object, i);
922 if (c >= 0x10000) {
923 ressize += 1+1+8;
924 }
925 else if (c >= 0x100) {
926 ressize += 1+1+4;
927 }
928 else
929 ressize += 1+1+2;
930 }
931 res = PyUnicode_New(ressize, 127);
932 if (res == NULL) {
933 Py_DECREF(object);
934 return NULL;
935 }
936 outp = PyUnicode_1BYTE_DATA(res);
937 for (i = start; i < end; ++i) {
938 c = PyUnicode_READ_CHAR(object, i);
939 *outp++ = '\\';
940 if (c >= 0x00010000) {
941 *outp++ = 'U';
942 *outp++ = Py_hexdigits[(c>>28)&0xf];
943 *outp++ = Py_hexdigits[(c>>24)&0xf];
944 *outp++ = Py_hexdigits[(c>>20)&0xf];
945 *outp++ = Py_hexdigits[(c>>16)&0xf];
946 *outp++ = Py_hexdigits[(c>>12)&0xf];
947 *outp++ = Py_hexdigits[(c>>8)&0xf];
948 }
949 else if (c >= 0x100) {
950 *outp++ = 'u';
951 *outp++ = Py_hexdigits[(c>>12)&0xf];
952 *outp++ = Py_hexdigits[(c>>8)&0xf];
953 }
954 else
955 *outp++ = 'x';
956 *outp++ = Py_hexdigits[(c>>4)&0xf];
957 *outp++ = Py_hexdigits[c&0xf];
958 }
959
960 assert(_PyUnicode_CheckConsistency(res, 1));
961 Py_DECREF(object);
962 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000963}
964
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200965static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200966
967PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
968{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300969 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200970 PyObject *restuple;
971 PyObject *object;
972 Py_ssize_t i;
973 Py_ssize_t start;
974 Py_ssize_t end;
975 PyObject *res;
976 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200977 Py_ssize_t ressize;
978 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200979 Py_UCS4 c;
980 char buffer[256]; /* NAME_MAXLEN */
981 if (PyUnicodeEncodeError_GetStart(exc, &start))
982 return NULL;
983 if (PyUnicodeEncodeError_GetEnd(exc, &end))
984 return NULL;
985 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
986 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200987 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200988 /* load the unicode data module */
989 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
990 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200991 if (!ucnhash_CAPI)
992 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200993 }
994 for (i = start, ressize = 0; i < end; ++i) {
995 /* object is guaranteed to be "ready" */
996 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200997 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200998 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200999 }
1000 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001001 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001002 }
1003 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001004 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001005 }
1006 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001007 replsize = 1+1+2;
1008 if (ressize > PY_SSIZE_T_MAX - replsize)
1009 break;
1010 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001011 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001012 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001013 res = PyUnicode_New(ressize, 127);
1014 if (res==NULL)
1015 return NULL;
1016 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1017 i < end; ++i) {
1018 c = PyUnicode_READ_CHAR(object, i);
1019 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001020 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001021 *outp++ = 'N';
1022 *outp++ = '{';
1023 strcpy((char *)outp, buffer);
1024 outp += strlen(buffer);
1025 *outp++ = '}';
1026 continue;
1027 }
1028 if (c >= 0x00010000) {
1029 *outp++ = 'U';
1030 *outp++ = Py_hexdigits[(c>>28)&0xf];
1031 *outp++ = Py_hexdigits[(c>>24)&0xf];
1032 *outp++ = Py_hexdigits[(c>>20)&0xf];
1033 *outp++ = Py_hexdigits[(c>>16)&0xf];
1034 *outp++ = Py_hexdigits[(c>>12)&0xf];
1035 *outp++ = Py_hexdigits[(c>>8)&0xf];
1036 }
1037 else if (c >= 0x100) {
1038 *outp++ = 'u';
1039 *outp++ = Py_hexdigits[(c>>12)&0xf];
1040 *outp++ = Py_hexdigits[(c>>8)&0xf];
1041 }
1042 else
1043 *outp++ = 'x';
1044 *outp++ = Py_hexdigits[(c>>4)&0xf];
1045 *outp++ = Py_hexdigits[c&0xf];
1046 }
1047
Benjamin Peterson3663b582014-11-26 14:39:54 -06001048 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001049 assert(_PyUnicode_CheckConsistency(res, 1));
1050 restuple = Py_BuildValue("(Nn)", res, end);
1051 Py_DECREF(object);
1052 return restuple;
1053 }
1054 else {
1055 wrong_exception_type(exc);
1056 return NULL;
1057 }
1058}
1059
/* Codes returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Interpret what follows "utf16"/"utf32" in an encoding name.

   An empty suffix selects the platform's native byte order.  Otherwise an
   optional '-' or '_' must be followed by exactly "be" or "le" (any case).
   Returns big_endian, little_endian, or ENC_UNKNOWN for anything else. */
static int
utf_byteorder_code(const char *suffix, int big_endian, int little_endian)
{
    if (*suffix == '\0') {
#ifdef WORDS_BIGENDIAN
        return big_endian;
#else
        return little_endian;
#endif
    }
    if (*suffix == '-' || *suffix == '_' )
        suffix++;
    /* Check suffix[0] first: for a name such as "utf-16-" the suffix is now
       empty, and suffix[1] would lie beyond the terminating NUL. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        if (Py_TOLOWER(suffix[0]) == 'b')
            return big_endian;
        if (Py_TOLOWER(suffix[0]) == 'l')
            return little_endian;
    }
    return ENC_UNKNOWN;
}

/* Map an encoding name to one of the ENC_* codes.

   Recognised spellings are "utf" (any case), an optional '-' or '_', then
   "8", "16" or "32"; the latter two may carry a byte-order suffix (see
   utf_byteorder_code()).  The exact name "CP_UTF8" is also accepted.

   On success *bytelength receives the size used per surrogate code point:
   3 for UTF-8, 2 for UTF-16, 4 for UTF-32.  Note that *bytelength is also
   written when a "utf16"/"utf32" name has an unrecognised suffix and
   ENC_UNKNOWN is returned; it is left untouched in all other failure
   cases. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            return utf_byteorder_code(encoding + 2,
                                      ENC_UTF16BE, ENC_UTF16LE);
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            return utf_byteorder_code(encoding + 2,
                                      ENC_UTF32BE, ENC_UTF32LE);
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1125
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001126/* This handler is declared static until someone demonstrates
1127 a need to call it directly. */
1128static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001129PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001130{
1131 PyObject *restuple;
1132 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001133 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001134 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001135 int code;
1136 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001137 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001138 Py_ssize_t start;
1139 Py_ssize_t end;
1140 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001141
1142 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001143 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001144 if (PyUnicodeEncodeError_GetStart(exc, &start))
1145 return NULL;
1146 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1147 return NULL;
1148 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1149 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001150 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1151 Py_DECREF(object);
1152 return NULL;
1153 }
1154 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1155 Py_DECREF(object);
1156 Py_DECREF(encode);
1157 return NULL;
1158 }
1159 code = get_standard_encoding(encoding, &bytelength);
1160 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001161 if (code == ENC_UNKNOWN) {
1162 /* Not supported, fail with original exception */
1163 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1164 Py_DECREF(object);
1165 return NULL;
1166 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001167
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001168 if (end - start > PY_SSIZE_T_MAX / bytelength)
1169 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001170 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001171 if (!res) {
1172 Py_DECREF(object);
1173 return NULL;
1174 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001175 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001176 for (i = start; i < end; i++) {
1177 /* object is guaranteed to be "ready" */
1178 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001179 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 /* Not a surrogate, fail with original exception */
1181 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1182 Py_DECREF(res);
1183 Py_DECREF(object);
1184 return NULL;
1185 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001186 switch (code) {
1187 case ENC_UTF8:
1188 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1189 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1190 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1191 break;
1192 case ENC_UTF16LE:
1193 *outp++ = (unsigned char) ch;
1194 *outp++ = (unsigned char)(ch >> 8);
1195 break;
1196 case ENC_UTF16BE:
1197 *outp++ = (unsigned char)(ch >> 8);
1198 *outp++ = (unsigned char) ch;
1199 break;
1200 case ENC_UTF32LE:
1201 *outp++ = (unsigned char) ch;
1202 *outp++ = (unsigned char)(ch >> 8);
1203 *outp++ = (unsigned char)(ch >> 16);
1204 *outp++ = (unsigned char)(ch >> 24);
1205 break;
1206 case ENC_UTF32BE:
1207 *outp++ = (unsigned char)(ch >> 24);
1208 *outp++ = (unsigned char)(ch >> 16);
1209 *outp++ = (unsigned char)(ch >> 8);
1210 *outp++ = (unsigned char) ch;
1211 break;
1212 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001213 }
1214 restuple = Py_BuildValue("(On)", res, end);
1215 Py_DECREF(res);
1216 Py_DECREF(object);
1217 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001218 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001219 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001220 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001221 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001222 if (PyUnicodeDecodeError_GetStart(exc, &start))
1223 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001224 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1225 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1227 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001228 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001229 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1230 Py_DECREF(object);
1231 return NULL;
1232 }
1233 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1234 Py_DECREF(object);
1235 Py_DECREF(encode);
1236 return NULL;
1237 }
1238 code = get_standard_encoding(encoding, &bytelength);
1239 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001240 if (code == ENC_UNKNOWN) {
1241 /* Not supported, fail with original exception */
1242 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1243 Py_DECREF(object);
1244 return NULL;
1245 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001246
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001247 /* Try decoding a single surrogate character. If
1248 there are more, let the codec call us again. */
1249 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001250 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1251 switch (code) {
1252 case ENC_UTF8:
1253 if ((p[0] & 0xf0) == 0xe0 &&
1254 (p[1] & 0xc0) == 0x80 &&
1255 (p[2] & 0xc0) == 0x80) {
1256 /* it's a three-byte code */
1257 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1258 }
1259 break;
1260 case ENC_UTF16LE:
1261 ch = p[1] << 8 | p[0];
1262 break;
1263 case ENC_UTF16BE:
1264 ch = p[0] << 8 | p[1];
1265 break;
1266 case ENC_UTF32LE:
1267 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1268 break;
1269 case ENC_UTF32BE:
1270 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1271 break;
1272 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001273 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001274
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001275 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001276 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1277 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001278 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1279 return NULL;
1280 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001281 res = PyUnicode_FromOrdinal(ch);
1282 if (res == NULL)
1283 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001284 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001285 }
1286 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001287 wrong_exception_type(exc);
1288 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001289 }
1290}
1291
Martin v. Löwis011e8422009-05-05 04:43:17 +00001292static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001293PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001294{
1295 PyObject *restuple;
1296 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001297 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001298 Py_ssize_t start;
1299 Py_ssize_t end;
1300 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001301
1302 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001303 char *outp;
1304 if (PyUnicodeEncodeError_GetStart(exc, &start))
1305 return NULL;
1306 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1307 return NULL;
1308 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1309 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 res = PyBytes_FromStringAndSize(NULL, end-start);
1311 if (!res) {
1312 Py_DECREF(object);
1313 return NULL;
1314 }
1315 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001316 for (i = start; i < end; i++) {
1317 /* object is guaranteed to be "ready" */
1318 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001319 if (ch < 0xdc80 || ch > 0xdcff) {
1320 /* Not a UTF-8b surrogate, fail with original exception */
1321 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1322 Py_DECREF(res);
1323 Py_DECREF(object);
1324 return NULL;
1325 }
1326 *outp++ = ch - 0xdc00;
1327 }
1328 restuple = Py_BuildValue("(On)", res, end);
1329 Py_DECREF(res);
1330 Py_DECREF(object);
1331 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001332 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001333 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001334 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001335 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001336 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 int consumed = 0;
1338 if (PyUnicodeDecodeError_GetStart(exc, &start))
1339 return NULL;
1340 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1341 return NULL;
1342 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1343 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001344 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001345 while (consumed < 4 && consumed < end-start) {
1346 /* Refuse to escape ASCII bytes. */
1347 if (p[start+consumed] < 128)
1348 break;
1349 ch[consumed] = 0xdc00 + p[start+consumed];
1350 consumed++;
1351 }
1352 Py_DECREF(object);
1353 if (!consumed) {
1354 /* codec complained about ASCII byte. */
1355 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1356 return NULL;
1357 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001358 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1359 if (str == NULL)
1360 return NULL;
1361 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001362 }
1363 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001364 wrong_exception_type(exc);
1365 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001366 }
1367}
1368
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001370static PyObject *strict_errors(PyObject *self, PyObject *exc)
1371{
1372 return PyCodec_StrictErrors(exc);
1373}
1374
1375
1376static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1377{
1378 return PyCodec_IgnoreErrors(exc);
1379}
1380
1381
1382static PyObject *replace_errors(PyObject *self, PyObject *exc)
1383{
1384 return PyCodec_ReplaceErrors(exc);
1385}
1386
1387
1388static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1389{
1390 return PyCodec_XMLCharRefReplaceErrors(exc);
1391}
1392
1393
1394static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1395{
1396 return PyCodec_BackslashReplaceErrors(exc);
1397}
1398
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001399static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1400{
1401 return PyCodec_NameReplaceErrors(exc);
1402}
1403
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001404static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001405{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001406 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001407}
1408
Martin v. Löwis43c57782009-05-10 08:15:24 +00001409static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001410{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001411 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001412}
1413
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001414static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001415{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001416 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001417 char *name;
1418 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001419 } methods[] =
1420 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001421 {
1422 "strict",
1423 {
1424 "strict_errors",
1425 strict_errors,
1426 METH_O,
1427 PyDoc_STR("Implements the 'strict' error handling, which "
1428 "raises a UnicodeError on coding errors.")
1429 }
1430 },
1431 {
1432 "ignore",
1433 {
1434 "ignore_errors",
1435 ignore_errors,
1436 METH_O,
1437 PyDoc_STR("Implements the 'ignore' error handling, which "
1438 "ignores malformed data and continues.")
1439 }
1440 },
1441 {
1442 "replace",
1443 {
1444 "replace_errors",
1445 replace_errors,
1446 METH_O,
1447 PyDoc_STR("Implements the 'replace' error handling, which "
1448 "replaces malformed data with a replacement marker.")
1449 }
1450 },
1451 {
1452 "xmlcharrefreplace",
1453 {
1454 "xmlcharrefreplace_errors",
1455 xmlcharrefreplace_errors,
1456 METH_O,
1457 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1458 "which replaces an unencodable character with the "
1459 "appropriate XML character reference.")
1460 }
1461 },
1462 {
1463 "backslashreplace",
1464 {
1465 "backslashreplace_errors",
1466 backslashreplace_errors,
1467 METH_O,
1468 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001469 "which replaces malformed data with a backslashed "
1470 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001471 }
1472 },
1473 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001474 "namereplace",
1475 {
1476 "namereplace_errors",
1477 namereplace_errors,
1478 METH_O,
1479 PyDoc_STR("Implements the 'namereplace' error handling, "
1480 "which replaces an unencodable character with a "
1481 "\\N{...} escape sequence.")
1482 }
1483 },
1484 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001485 "surrogatepass",
1486 {
1487 "surrogatepass",
1488 surrogatepass_errors,
1489 METH_O
1490 }
1491 },
1492 {
1493 "surrogateescape",
1494 {
1495 "surrogateescape",
1496 surrogateescape_errors,
1497 METH_O
1498 }
1499 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001500 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001501
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001502 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001503 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001504 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001505
1506 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001507 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001508
1509 interp->codec_search_path = PyList_New(0);
1510 interp->codec_search_cache = PyDict_New();
1511 interp->codec_error_registry = PyDict_New();
1512
1513 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001514 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001515 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001516 int res;
1517 if (!func)
1518 Py_FatalError("can't initialize codec error registry");
1519 res = PyCodec_RegisterError(methods[i].name, func);
1520 Py_DECREF(func);
1521 if (res)
1522 Py_FatalError("can't initialize codec error registry");
1523 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001525
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001526 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001527 interp->codec_search_cache == NULL ||
1528 interp->codec_error_registry == NULL)
1529 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001530
Christian Heimes819b8bf2008-01-03 23:05:47 +00001531 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001532 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001533 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001534 }
1535 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001536 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001537 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001538}