blob: 38b0c2c33d029d884303c09facd26f9cd4b340dd [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020012#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000013#include <ctype.h>
14
Victor Stinnerf5cff562011-10-14 02:13:11 +020015const char *Py_hexdigits = "0123456789abcdef";
16
Guido van Rossumfeee4b92000-03-10 22:57:27 +000017/* --- Codec Registry ----------------------------------------------------- */
18
19/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000020 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000021
22 This is done in a lazy way so that the Unicode implementation does
23 not downgrade startup time of scripts not needing it.
24
Guido van Rossumb95de4f2000-03-31 17:25:23 +000025 ImportErrors are silently ignored by this function. Only one try is
26 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000027
28*/
29
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000030static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032int PyCodec_Register(PyObject *search_function)
33{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000034 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000035 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000038 PyErr_BadArgument();
39 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000040 }
41 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000042 PyErr_SetString(PyExc_TypeError, "argument must be callable");
43 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000044 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000045 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000046
47 onError:
48 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000049}
50
Guido van Rossum9e896b32000-04-05 20:11:21 +000051/* Convert a string to a normalized Python string: all characters are
52 converted to lower case, spaces are replaced with underscores. */
53
Guido van Rossumfeee4b92000-03-10 22:57:27 +000054static
Guido van Rossum9e896b32000-04-05 20:11:21 +000055PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020057 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000058 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000059 char *p;
60 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000061
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000062 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000063 PyErr_SetString(PyExc_OverflowError, "string is too large");
64 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000065 }
Guido van Rossum21431e82007-10-19 21:48:41 +000066
67 p = PyMem_Malloc(len + 1);
68 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020069 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000070 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020071 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000072 if (ch == ' ')
73 ch = '-';
74 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020075 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000076 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000077 }
Guido van Rossum21431e82007-10-19 21:48:41 +000078 p[i] = '\0';
79 v = PyUnicode_FromString(p);
80 if (v == NULL)
81 return NULL;
82 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000083 return v;
84}
85
86/* Lookup the given encoding and return a tuple providing the codec
87 facilities.
88
89 The encoding string is looked up converted to all lower-case
90 characters. This makes encodings looked up through this mechanism
91 effectively case-insensitive.
92
Guido van Rossum98297ee2007-11-06 21:34:58 +000093 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000094
95 As side effect, this tries to load the encodings package, if not
96 yet done. This is part of the lazy load strategy for the encodings
97 package.
98
99*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000100
101PyObject *_PyCodec_Lookup(const char *encoding)
102{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000103 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000104 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000105 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000106
Fred Drake766de832000-05-09 19:55:59 +0000107 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000108 PyErr_BadArgument();
109 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000110 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000111
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000112 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000113 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000114 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000115
Guido van Rossum9e896b32000-04-05 20:11:21 +0000116 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000117 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000118 replaced with underscores. */
119 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000120 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000121 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000122 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000123
124 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000125 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000127 Py_INCREF(result);
128 Py_DECREF(v);
129 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000130 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000131
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000133 args = PyTuple_New(1);
134 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000135 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000136 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000137
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000138 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000139 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000140 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000141 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000142 PyErr_SetString(PyExc_LookupError,
143 "no codec search functions registered: "
144 "can't find encoding");
145 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000146 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000147
148 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000149 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000150
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000151 func = PyList_GetItem(interp->codec_search_path, i);
152 if (func == NULL)
153 goto onError;
154 result = PyEval_CallObject(func, args);
155 if (result == NULL)
156 goto onError;
157 if (result == Py_None) {
158 Py_DECREF(result);
159 continue;
160 }
161 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
162 PyErr_SetString(PyExc_TypeError,
163 "codec search functions must return 4-tuples");
164 Py_DECREF(result);
165 goto onError;
166 }
167 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000168 }
169 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 /* XXX Perhaps we should cache misses too ? */
171 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000172 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000173 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000174 }
175
176 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000177 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000178 Py_DECREF(result);
179 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000180 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000181 Py_DECREF(args);
182 return result;
183
184 onError:
185 Py_XDECREF(args);
186 return NULL;
187}
188
Nick Coghlan8fad1672014-09-15 23:50:44 +1200189int _PyCodec_Forget(const char *encoding)
190{
191 PyInterpreterState *interp;
192 PyObject *v;
193 int result;
194
195 interp = PyThreadState_GET()->interp;
196 if (interp->codec_search_path == NULL) {
197 return -1;
198 }
199
200 /* Convert the encoding to a normalized Python string: all
201 characters are converted to lower case, spaces and hyphens are
202 replaced with underscores. */
203 v = normalizestring(encoding);
204 if (v == NULL) {
205 return -1;
206 }
207
208 /* Drop the named codec from the internal cache */
209 result = PyDict_DelItem(interp->codec_search_cache, v);
210 Py_DECREF(v);
211
212 return result;
213}
214
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000215/* Codec registry encoding check API. */
216
217int PyCodec_KnownEncoding(const char *encoding)
218{
219 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000220
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000221 codecs = _PyCodec_Lookup(encoding);
222 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000223 PyErr_Clear();
224 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000225 }
226 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000227 Py_DECREF(codecs);
228 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000229 }
230}
231
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000232static
233PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000234 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000235{
236 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000237
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000238 args = PyTuple_New(1 + (errors != NULL));
239 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000241 Py_INCREF(object);
242 PyTuple_SET_ITEM(args,0,object);
243 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000244 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000245
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000246 v = PyUnicode_FromString(errors);
247 if (v == NULL) {
248 Py_DECREF(args);
249 return NULL;
250 }
251 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000252 }
253 return args;
254}
255
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000256/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000257
258static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000260{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 PyObject *codecs;
262 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000263
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000264 codecs = _PyCodec_Lookup(encoding);
265 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000266 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000267 v = PyTuple_GET_ITEM(codecs, index);
268 Py_DECREF(codecs);
269 Py_INCREF(v);
270 return v;
271}
272
Nick Coghlana9b15242014-02-04 22:11:18 +1000273/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274static
Nick Coghlana9b15242014-02-04 22:11:18 +1000275PyObject *codec_makeincrementalcodec(PyObject *codec_info,
276 const char *errors,
277 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278{
Nick Coghlana9b15242014-02-04 22:11:18 +1000279 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280
Nick Coghlana9b15242014-02-04 22:11:18 +1000281 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000283 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000284 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000285 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000286 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000287 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000288 Py_DECREF(inccodec);
289 return ret;
290}
291
Nick Coghlana9b15242014-02-04 22:11:18 +1000292static
293PyObject *codec_getincrementalcodec(const char *encoding,
294 const char *errors,
295 const char *attrname)
296{
297 PyObject *codec_info, *ret;
298
299 codec_info = _PyCodec_Lookup(encoding);
300 if (codec_info == NULL)
301 return NULL;
302 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
303 Py_DECREF(codec_info);
304 return ret;
305}
306
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000307/* Helper function to create a stream codec. */
308
309static
310PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000311 PyObject *stream,
312 const char *errors,
313 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000314{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000315 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000316
317 codecs = _PyCodec_Lookup(encoding);
318 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000319 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000320
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000321 codeccls = PyTuple_GET_ITEM(codecs, index);
322 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000323 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000324 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000325 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000326 Py_DECREF(codecs);
327 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000328}
329
Nick Coghlana9b15242014-02-04 22:11:18 +1000330/* Helpers to work with the result of _PyCodec_Lookup
331
332 */
333PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
334 const char *errors)
335{
336 return codec_makeincrementalcodec(codec_info, errors,
337 "incrementaldecoder");
338}
339
340PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
341 const char *errors)
342{
343 return codec_makeincrementalcodec(codec_info, errors,
344 "incrementalencoder");
345}
346
347
Guido van Rossum98297ee2007-11-06 21:34:58 +0000348/* Convenience APIs to query the Codec registry.
349
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000350 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000351
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352 */
353
354PyObject *PyCodec_Encoder(const char *encoding)
355{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000356 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000357}
358
359PyObject *PyCodec_Decoder(const char *encoding)
360{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000362}
363
Thomas Woutersa9773292006-04-21 09:43:23 +0000364PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000366{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000368}
369
370PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000371 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000372{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000373 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000374}
375
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000376PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000377 PyObject *stream,
378 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000379{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000380 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000381}
382
383PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000384 PyObject *stream,
385 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000386{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000387 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000388}
389
Nick Coghlan8b097b42013-11-13 23:49:21 +1000390/* Helper that tries to ensure the reported exception chain indicates the
391 * codec that was invoked to trigger the failure without changing the type
392 * of the exception raised.
393 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* Hand the operation and codec name to _PyErr_TrySetFromCause(),
     * which either swaps the active exception for an updated clone
     * carrying this message or, when it cannot, leaves the original
     * exception untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation, encoding);
}
405
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000406/* Encode an object (e.g. an Unicode object) using the given encoding
407 and return the resulting encoded object (usually a Python string).
408
409 errors is passed to the encoder factory as argument if non-NULL. */
410
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000411static PyObject *
412_PyCodec_EncodeInternal(PyObject *object,
413 PyObject *encoder,
414 const char *encoding,
415 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000416{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000417 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000418 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000420 args = args_tuple(object, errors);
421 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000422 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000423
424 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000425 if (result == NULL) {
426 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000427 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000428 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000429
Guido van Rossum98297ee2007-11-06 21:34:58 +0000430 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000431 PyTuple_GET_SIZE(result) != 2) {
432 PyErr_SetString(PyExc_TypeError,
433 "encoder must return a tuple (object, integer)");
434 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000435 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000436 v = PyTuple_GET_ITEM(result,0);
437 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000438 /* We don't check or use the second (integer) entry. */
439
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000440 Py_DECREF(args);
441 Py_DECREF(encoder);
442 Py_DECREF(result);
443 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000444
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000445 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000446 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000447 Py_XDECREF(args);
448 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000449 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000450}
451
452/* Decode an object (usually a Python string) using the given encoding
453 and return an equivalent object (e.g. an Unicode object).
454
455 errors is passed to the decoder factory as argument if non-NULL. */
456
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000457static PyObject *
458_PyCodec_DecodeInternal(PyObject *object,
459 PyObject *decoder,
460 const char *encoding,
461 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000462{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000463 PyObject *args = NULL, *result = NULL;
464 PyObject *v;
465
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000466 args = args_tuple(object, errors);
467 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000468 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000469
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000470 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000471 if (result == NULL) {
472 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000473 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000474 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000475 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000476 PyTuple_GET_SIZE(result) != 2) {
477 PyErr_SetString(PyExc_TypeError,
478 "decoder must return a tuple (object,integer)");
479 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000480 }
481 v = PyTuple_GET_ITEM(result,0);
482 Py_INCREF(v);
483 /* We don't check or use the second (integer) entry. */
484
485 Py_DECREF(args);
486 Py_DECREF(decoder);
487 Py_DECREF(result);
488 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000489
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000490 onError:
491 Py_XDECREF(args);
492 Py_XDECREF(decoder);
493 Py_XDECREF(result);
494 return NULL;
495}
496
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000497/* Generic encoding/decoding API */
498PyObject *PyCodec_Encode(PyObject *object,
499 const char *encoding,
500 const char *errors)
501{
502 PyObject *encoder;
503
504 encoder = PyCodec_Encoder(encoding);
505 if (encoder == NULL)
506 return NULL;
507
508 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
509}
510
511PyObject *PyCodec_Decode(PyObject *object,
512 const char *encoding,
513 const char *errors)
514{
515 PyObject *decoder;
516
517 decoder = PyCodec_Decoder(encoding);
518 if (decoder == NULL)
519 return NULL;
520
521 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
522}
523
524/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000525PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
526 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000527{
528 _Py_IDENTIFIER(_is_text_encoding);
529 PyObject *codec;
530 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000531 int is_text_codec;
532
533 codec = _PyCodec_Lookup(encoding);
534 if (codec == NULL)
535 return NULL;
536
537 /* Backwards compatibility: assume any raw tuple describes a text
538 * encoding, and the same for anything lacking the private
539 * attribute.
540 */
541 if (!PyTuple_CheckExact(codec)) {
542 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
543 if (attr == NULL) {
544 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
545 PyErr_Clear();
546 } else {
547 Py_DECREF(codec);
548 return NULL;
549 }
550 } else {
551 is_text_codec = PyObject_IsTrue(attr);
552 Py_DECREF(attr);
553 if (!is_text_codec) {
554 Py_DECREF(codec);
555 PyErr_Format(PyExc_LookupError,
556 "'%.400s' is not a text encoding; "
Nick Coghlana9b15242014-02-04 22:11:18 +1000557 "use %s to handle arbitrary codecs",
558 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000559 return NULL;
560 }
561 }
562 }
563
Nick Coghlana9b15242014-02-04 22:11:18 +1000564 /* This appears to be a valid text encoding */
565 return codec;
566}
567
568
569static
570PyObject *codec_getitem_checked(const char *encoding,
571 const char *alternate_command,
572 int index)
573{
574 PyObject *codec;
575 PyObject *v;
576
577 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578 if (codec == NULL)
579 return NULL;
580
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000581 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000583 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000584 return v;
585}
586
587static PyObject * _PyCodec_TextEncoder(const char *encoding)
588{
Nick Coghlana9b15242014-02-04 22:11:18 +1000589 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000590}
591
592static PyObject * _PyCodec_TextDecoder(const char *encoding)
593{
Nick Coghlana9b15242014-02-04 22:11:18 +1000594 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000595}
596
597PyObject *_PyCodec_EncodeText(PyObject *object,
598 const char *encoding,
599 const char *errors)
600{
601 PyObject *encoder;
602
603 encoder = _PyCodec_TextEncoder(encoding);
604 if (encoder == NULL)
605 return NULL;
606
607 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
608}
609
610PyObject *_PyCodec_DecodeText(PyObject *object,
611 const char *encoding,
612 const char *errors)
613{
614 PyObject *decoder;
615
616 decoder = _PyCodec_TextDecoder(encoding);
617 if (decoder == NULL)
618 return NULL;
619
620 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
621}
622
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000623/* Register the error handling callback function error under the name
624 name. This function will be called by the codec when it encounters
625 an unencodable characters/undecodable bytes and doesn't know the
626 callback name, when name is specified as the error parameter
627 in the call to the encode/decode function.
628 Return 0 on success, -1 on error */
629int PyCodec_RegisterError(const char *name, PyObject *error)
630{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000631 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000632 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000634 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 PyErr_SetString(PyExc_TypeError, "handler must be callable");
636 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000637 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000638 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300639 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000640}
641
642/* Lookup the error handling callback function registered under the
643 name error. As a special case NULL can be passed, in which case
644 the error handling callback for strict encoding will be returned. */
645PyObject *PyCodec_LookupError(const char *name)
646{
647 PyObject *handler = NULL;
648
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000649 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000650 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000652
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000653 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300655 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000658 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000660 return handler;
661}
662
663static void wrong_exception_type(PyObject *exc)
664{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300665 PyErr_Format(PyExc_TypeError,
666 "don't know how to handle %.200s in error callback",
667 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668}
669
670PyObject *PyCodec_StrictErrors(PyObject *exc)
671{
Brett Cannonbf364092006-03-01 04:25:17 +0000672 if (PyExceptionInstance_Check(exc))
673 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000674 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000675 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676 return NULL;
677}
678
679
680PyObject *PyCodec_IgnoreErrors(PyObject *exc)
681{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000682 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300683
684 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000685 if (PyUnicodeEncodeError_GetEnd(exc, &end))
686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000687 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300688 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689 if (PyUnicodeDecodeError_GetEnd(exc, &end))
690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000691 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300692 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000693 if (PyUnicodeTranslateError_GetEnd(exc, &end))
694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695 }
696 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000697 wrong_exception_type(exc);
698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000699 }
Victor Stinneree450092011-12-01 02:52:11 +0100700 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701}
702
703
704PyObject *PyCodec_ReplaceErrors(PyObject *exc)
705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200706 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000707
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300708 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710 int kind;
711 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 if (PyUnicodeEncodeError_GetStart(exc, &start))
713 return NULL;
714 if (PyUnicodeEncodeError_GetEnd(exc, &end))
715 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716 len = end - start;
717 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000718 if (res == NULL)
719 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720 kind = PyUnicode_KIND(res);
721 data = PyUnicode_DATA(res);
722 for (i = 0; i < len; ++i)
723 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200724 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200725 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000726 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300727 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 if (PyUnicodeDecodeError_GetEnd(exc, &end))
729 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200730 return Py_BuildValue("(Cn)",
731 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
732 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000733 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300734 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 int kind;
737 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 if (PyUnicodeTranslateError_GetStart(exc, &start))
739 return NULL;
740 if (PyUnicodeTranslateError_GetEnd(exc, &end))
741 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742 len = end - start;
743 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 if (res == NULL)
745 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746 kind = PyUnicode_KIND(res);
747 data = PyUnicode_DATA(res);
748 for (i=0; i < len; i++)
749 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200750 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000752 }
753 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 wrong_exception_type(exc);
755 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756 }
757}
758
759PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
760{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300761 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 PyObject *restuple;
763 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100764 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 Py_ssize_t start;
766 Py_ssize_t end;
767 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100768 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300769 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100770 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 if (PyUnicodeEncodeError_GetStart(exc, &start))
772 return NULL;
773 if (PyUnicodeEncodeError_GetEnd(exc, &end))
774 return NULL;
775 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
776 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300777 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
778 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100779 for (i = start, ressize = 0; i < end; ++i) {
780 /* object is guaranteed to be "ready" */
781 ch = PyUnicode_READ_CHAR(object, i);
782 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 ressize += 2+6+1;
794 else
795 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 }
797 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100798 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 if (res == NULL) {
800 Py_DECREF(object);
801 return NULL;
802 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100803 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100805 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 int digits;
807 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100808 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 *outp++ = '&';
810 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100811 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 digits = 1;
813 base = 1;
814 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100815 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 digits = 2;
817 base = 10;
818 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100819 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 digits = 3;
821 base = 100;
822 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100823 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 digits = 4;
825 base = 1000;
826 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100827 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 digits = 5;
829 base = 10000;
830 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100831 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 digits = 6;
833 base = 100000;
834 }
835 else {
836 digits = 7;
837 base = 1000000;
838 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100840 *outp++ = '0' + ch/base;
841 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 base /= 10;
843 }
844 *outp++ = ';';
845 }
Victor Stinner8f825062012-04-27 13:55:39 +0200846 assert(_PyUnicode_CheckConsistency(res, 1));
847 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 Py_DECREF(object);
849 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000850 }
851 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 wrong_exception_type(exc);
853 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000854 }
855}
856
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
858{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200859 PyObject *object;
860 Py_ssize_t i;
861 Py_ssize_t start;
862 Py_ssize_t end;
863 PyObject *res;
864 unsigned char *outp;
865 int ressize;
866 Py_UCS4 c;
867
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300868 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200869 unsigned char *p;
870 if (PyUnicodeDecodeError_GetStart(exc, &start))
871 return NULL;
872 if (PyUnicodeDecodeError_GetEnd(exc, &end))
873 return NULL;
874 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
875 return NULL;
876 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
877 Py_DECREF(object);
878 return NULL;
879 }
880 res = PyUnicode_New(4 * (end - start), 127);
881 if (res == NULL) {
882 Py_DECREF(object);
883 return NULL;
884 }
885 outp = PyUnicode_1BYTE_DATA(res);
886 for (i = start; i < end; i++, outp += 4) {
887 unsigned char c = p[i];
888 outp[0] = '\\';
889 outp[1] = 'x';
890 outp[2] = Py_hexdigits[(c>>4)&0xf];
891 outp[3] = Py_hexdigits[c&0xf];
892 }
893
894 assert(_PyUnicode_CheckConsistency(res, 1));
895 Py_DECREF(object);
896 return Py_BuildValue("(Nn)", res, end);
897 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300898 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 if (PyUnicodeEncodeError_GetStart(exc, &start))
900 return NULL;
901 if (PyUnicodeEncodeError_GetEnd(exc, &end))
902 return NULL;
903 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
904 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200905 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300906 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200907 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200909 if (PyUnicodeTranslateError_GetEnd(exc, &end))
910 return NULL;
911 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
912 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000913 }
914 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 wrong_exception_type(exc);
916 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000917 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200918
919 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
920 end = start + PY_SSIZE_T_MAX / (1+1+8);
921 for (i = start, ressize = 0; i < end; ++i) {
922 /* object is guaranteed to be "ready" */
923 c = PyUnicode_READ_CHAR(object, i);
924 if (c >= 0x10000) {
925 ressize += 1+1+8;
926 }
927 else if (c >= 0x100) {
928 ressize += 1+1+4;
929 }
930 else
931 ressize += 1+1+2;
932 }
933 res = PyUnicode_New(ressize, 127);
934 if (res == NULL) {
935 Py_DECREF(object);
936 return NULL;
937 }
938 outp = PyUnicode_1BYTE_DATA(res);
939 for (i = start; i < end; ++i) {
940 c = PyUnicode_READ_CHAR(object, i);
941 *outp++ = '\\';
942 if (c >= 0x00010000) {
943 *outp++ = 'U';
944 *outp++ = Py_hexdigits[(c>>28)&0xf];
945 *outp++ = Py_hexdigits[(c>>24)&0xf];
946 *outp++ = Py_hexdigits[(c>>20)&0xf];
947 *outp++ = Py_hexdigits[(c>>16)&0xf];
948 *outp++ = Py_hexdigits[(c>>12)&0xf];
949 *outp++ = Py_hexdigits[(c>>8)&0xf];
950 }
951 else if (c >= 0x100) {
952 *outp++ = 'u';
953 *outp++ = Py_hexdigits[(c>>12)&0xf];
954 *outp++ = Py_hexdigits[(c>>8)&0xf];
955 }
956 else
957 *outp++ = 'x';
958 *outp++ = Py_hexdigits[(c>>4)&0xf];
959 *outp++ = Py_hexdigits[c&0xf];
960 }
961
962 assert(_PyUnicode_CheckConsistency(res, 1));
963 Py_DECREF(object);
964 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000965}
966
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200967static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
968static int ucnhash_initialized = 0;
969
970PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
971{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300972 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200973 PyObject *restuple;
974 PyObject *object;
975 Py_ssize_t i;
976 Py_ssize_t start;
977 Py_ssize_t end;
978 PyObject *res;
979 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200980 Py_ssize_t ressize;
981 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200982 Py_UCS4 c;
983 char buffer[256]; /* NAME_MAXLEN */
984 if (PyUnicodeEncodeError_GetStart(exc, &start))
985 return NULL;
986 if (PyUnicodeEncodeError_GetEnd(exc, &end))
987 return NULL;
988 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
989 return NULL;
990 if (!ucnhash_initialized) {
991 /* load the unicode data module */
992 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
993 PyUnicodeData_CAPSULE_NAME, 1);
994 ucnhash_initialized = 1;
995 }
996 for (i = start, ressize = 0; i < end; ++i) {
997 /* object is guaranteed to be "ready" */
998 c = PyUnicode_READ_CHAR(object, i);
999 if (ucnhash_CAPI &&
1000 ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +02001001 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001002 }
1003 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001004 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001005 }
1006 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001007 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001008 }
1009 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001010 replsize = 1+1+2;
1011 if (ressize > PY_SSIZE_T_MAX - replsize)
1012 break;
1013 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001014 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001015 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001016 res = PyUnicode_New(ressize, 127);
1017 if (res==NULL)
1018 return NULL;
1019 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1020 i < end; ++i) {
1021 c = PyUnicode_READ_CHAR(object, i);
1022 *outp++ = '\\';
1023 if (ucnhash_CAPI &&
1024 ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1025 *outp++ = 'N';
1026 *outp++ = '{';
1027 strcpy((char *)outp, buffer);
1028 outp += strlen(buffer);
1029 *outp++ = '}';
1030 continue;
1031 }
1032 if (c >= 0x00010000) {
1033 *outp++ = 'U';
1034 *outp++ = Py_hexdigits[(c>>28)&0xf];
1035 *outp++ = Py_hexdigits[(c>>24)&0xf];
1036 *outp++ = Py_hexdigits[(c>>20)&0xf];
1037 *outp++ = Py_hexdigits[(c>>16)&0xf];
1038 *outp++ = Py_hexdigits[(c>>12)&0xf];
1039 *outp++ = Py_hexdigits[(c>>8)&0xf];
1040 }
1041 else if (c >= 0x100) {
1042 *outp++ = 'u';
1043 *outp++ = Py_hexdigits[(c>>12)&0xf];
1044 *outp++ = Py_hexdigits[(c>>8)&0xf];
1045 }
1046 else
1047 *outp++ = 'x';
1048 *outp++ = Py_hexdigits[(c>>4)&0xf];
1049 *outp++ = Py_hexdigits[c&0xf];
1050 }
1051
Benjamin Peterson3663b582014-11-26 14:39:54 -06001052 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001053 assert(_PyUnicode_CheckConsistency(res, 1));
1054 restuple = Py_BuildValue("(Nn)", res, end);
1055 Py_DECREF(object);
1056 return restuple;
1057 }
1058 else {
1059 wrong_exception_type(exc);
1060 return NULL;
1061 }
1062}
1063
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name onto one of the ENC_* codes.

   Recognised spellings (the "utf" prefix and the "be"/"le" suffix are
   case-insensitive; each '-' may also be '_' or be omitted):
     utf-8, utf-16, utf-16-be, utf-16-le, utf-32, utf-32-be, utf-32-le,
   plus the exact string "CP_UTF8".  utf-16 / utf-32 without a suffix
   select the byte order according to WORDS_BIGENDIAN.

   encoding must be a NUL-terminated string.  On a match *bytelength is
   set to the number of bytes one surrogate occupies in that encoding
   (3, 2 or 4).  Returns ENC_UNKNOWN for anything else; *bytelength may
   have been written even then and must not be relied upon. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: after skipping the separator the
               string may already be exhausted (e.g. "utf-16-"), and
               encoding[1] would then lie past the terminator. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16 above. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1129
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001130/* This handler is declared static until someone demonstrates
1131 a need to call it directly. */
1132static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001133PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001134{
1135 PyObject *restuple;
1136 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001137 PyObject *encode;
1138 char *encoding;
1139 int code;
1140 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001141 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001142 Py_ssize_t start;
1143 Py_ssize_t end;
1144 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001145
1146 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001147 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001148 if (PyUnicodeEncodeError_GetStart(exc, &start))
1149 return NULL;
1150 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1151 return NULL;
1152 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1153 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001154 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1155 Py_DECREF(object);
1156 return NULL;
1157 }
1158 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1159 Py_DECREF(object);
1160 Py_DECREF(encode);
1161 return NULL;
1162 }
1163 code = get_standard_encoding(encoding, &bytelength);
1164 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001165 if (code == ENC_UNKNOWN) {
1166 /* Not supported, fail with original exception */
1167 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1168 Py_DECREF(object);
1169 return NULL;
1170 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001171
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001172 if (end - start > PY_SSIZE_T_MAX / bytelength)
1173 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001174 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001175 if (!res) {
1176 Py_DECREF(object);
1177 return NULL;
1178 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001179 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001180 for (i = start; i < end; i++) {
1181 /* object is guaranteed to be "ready" */
1182 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001183 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001184 /* Not a surrogate, fail with original exception */
1185 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1186 Py_DECREF(res);
1187 Py_DECREF(object);
1188 return NULL;
1189 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001190 switch (code) {
1191 case ENC_UTF8:
1192 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1193 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1194 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1195 break;
1196 case ENC_UTF16LE:
1197 *outp++ = (unsigned char) ch;
1198 *outp++ = (unsigned char)(ch >> 8);
1199 break;
1200 case ENC_UTF16BE:
1201 *outp++ = (unsigned char)(ch >> 8);
1202 *outp++ = (unsigned char) ch;
1203 break;
1204 case ENC_UTF32LE:
1205 *outp++ = (unsigned char) ch;
1206 *outp++ = (unsigned char)(ch >> 8);
1207 *outp++ = (unsigned char)(ch >> 16);
1208 *outp++ = (unsigned char)(ch >> 24);
1209 break;
1210 case ENC_UTF32BE:
1211 *outp++ = (unsigned char)(ch >> 24);
1212 *outp++ = (unsigned char)(ch >> 16);
1213 *outp++ = (unsigned char)(ch >> 8);
1214 *outp++ = (unsigned char) ch;
1215 break;
1216 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001217 }
1218 restuple = Py_BuildValue("(On)", res, end);
1219 Py_DECREF(res);
1220 Py_DECREF(object);
1221 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001222 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001223 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001224 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001225 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001226 if (PyUnicodeDecodeError_GetStart(exc, &start))
1227 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001228 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1229 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001230 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1231 return NULL;
1232 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1233 Py_DECREF(object);
1234 return NULL;
1235 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001236 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1237 Py_DECREF(object);
1238 return NULL;
1239 }
1240 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1241 Py_DECREF(object);
1242 Py_DECREF(encode);
1243 return NULL;
1244 }
1245 code = get_standard_encoding(encoding, &bytelength);
1246 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001247 if (code == ENC_UNKNOWN) {
1248 /* Not supported, fail with original exception */
1249 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1250 Py_DECREF(object);
1251 return NULL;
1252 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001253
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001254 /* Try decoding a single surrogate character. If
1255 there are more, let the codec call us again. */
1256 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001257 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1258 switch (code) {
1259 case ENC_UTF8:
1260 if ((p[0] & 0xf0) == 0xe0 &&
1261 (p[1] & 0xc0) == 0x80 &&
1262 (p[2] & 0xc0) == 0x80) {
1263 /* it's a three-byte code */
1264 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1265 }
1266 break;
1267 case ENC_UTF16LE:
1268 ch = p[1] << 8 | p[0];
1269 break;
1270 case ENC_UTF16BE:
1271 ch = p[0] << 8 | p[1];
1272 break;
1273 case ENC_UTF32LE:
1274 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1275 break;
1276 case ENC_UTF32BE:
1277 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1278 break;
1279 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001280 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001281
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001282 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001283 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1284 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001285 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1286 return NULL;
1287 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001288 res = PyUnicode_FromOrdinal(ch);
1289 if (res == NULL)
1290 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001291 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001292 }
1293 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001294 wrong_exception_type(exc);
1295 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001296 }
1297}
1298
Martin v. Löwis011e8422009-05-05 04:43:17 +00001299static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001300PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001301{
1302 PyObject *restuple;
1303 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001304 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001305 Py_ssize_t start;
1306 Py_ssize_t end;
1307 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001308
1309 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001310 char *outp;
1311 if (PyUnicodeEncodeError_GetStart(exc, &start))
1312 return NULL;
1313 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1314 return NULL;
1315 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1316 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001317 res = PyBytes_FromStringAndSize(NULL, end-start);
1318 if (!res) {
1319 Py_DECREF(object);
1320 return NULL;
1321 }
1322 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001323 for (i = start; i < end; i++) {
1324 /* object is guaranteed to be "ready" */
1325 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001326 if (ch < 0xdc80 || ch > 0xdcff) {
1327 /* Not a UTF-8b surrogate, fail with original exception */
1328 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1329 Py_DECREF(res);
1330 Py_DECREF(object);
1331 return NULL;
1332 }
1333 *outp++ = ch - 0xdc00;
1334 }
1335 restuple = Py_BuildValue("(On)", res, end);
1336 Py_DECREF(res);
1337 Py_DECREF(object);
1338 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001339 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001340 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001341 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001342 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001343 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001344 int consumed = 0;
1345 if (PyUnicodeDecodeError_GetStart(exc, &start))
1346 return NULL;
1347 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1348 return NULL;
1349 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1350 return NULL;
1351 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1352 Py_DECREF(object);
1353 return NULL;
1354 }
1355 while (consumed < 4 && consumed < end-start) {
1356 /* Refuse to escape ASCII bytes. */
1357 if (p[start+consumed] < 128)
1358 break;
1359 ch[consumed] = 0xdc00 + p[start+consumed];
1360 consumed++;
1361 }
1362 Py_DECREF(object);
1363 if (!consumed) {
1364 /* codec complained about ASCII byte. */
1365 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1366 return NULL;
1367 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001368 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1369 if (str == NULL)
1370 return NULL;
1371 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001372 }
1373 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001374 wrong_exception_type(exc);
1375 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001376 }
1377}
1378
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001379
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001380static PyObject *strict_errors(PyObject *self, PyObject *exc)
1381{
1382 return PyCodec_StrictErrors(exc);
1383}
1384
1385
1386static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1387{
1388 return PyCodec_IgnoreErrors(exc);
1389}
1390
1391
1392static PyObject *replace_errors(PyObject *self, PyObject *exc)
1393{
1394 return PyCodec_ReplaceErrors(exc);
1395}
1396
1397
1398static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1399{
1400 return PyCodec_XMLCharRefReplaceErrors(exc);
1401}
1402
1403
1404static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1405{
1406 return PyCodec_BackslashReplaceErrors(exc);
1407}
1408
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001409static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1410{
1411 return PyCodec_NameReplaceErrors(exc);
1412}
1413
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001414static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001415{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001416 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001417}
1418
Martin v. Löwis43c57782009-05-10 08:15:24 +00001419static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001420{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001421 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001422}
1423
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001424static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001425{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001426 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001427 char *name;
1428 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001429 } methods[] =
1430 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001431 {
1432 "strict",
1433 {
1434 "strict_errors",
1435 strict_errors,
1436 METH_O,
1437 PyDoc_STR("Implements the 'strict' error handling, which "
1438 "raises a UnicodeError on coding errors.")
1439 }
1440 },
1441 {
1442 "ignore",
1443 {
1444 "ignore_errors",
1445 ignore_errors,
1446 METH_O,
1447 PyDoc_STR("Implements the 'ignore' error handling, which "
1448 "ignores malformed data and continues.")
1449 }
1450 },
1451 {
1452 "replace",
1453 {
1454 "replace_errors",
1455 replace_errors,
1456 METH_O,
1457 PyDoc_STR("Implements the 'replace' error handling, which "
1458 "replaces malformed data with a replacement marker.")
1459 }
1460 },
1461 {
1462 "xmlcharrefreplace",
1463 {
1464 "xmlcharrefreplace_errors",
1465 xmlcharrefreplace_errors,
1466 METH_O,
1467 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1468 "which replaces an unencodable character with the "
1469 "appropriate XML character reference.")
1470 }
1471 },
1472 {
1473 "backslashreplace",
1474 {
1475 "backslashreplace_errors",
1476 backslashreplace_errors,
1477 METH_O,
1478 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001479 "which replaces malformed data with a backslashed "
1480 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001481 }
1482 },
1483 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001484 "namereplace",
1485 {
1486 "namereplace_errors",
1487 namereplace_errors,
1488 METH_O,
1489 PyDoc_STR("Implements the 'namereplace' error handling, "
1490 "which replaces an unencodable character with a "
1491 "\\N{...} escape sequence.")
1492 }
1493 },
1494 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001495 "surrogatepass",
1496 {
1497 "surrogatepass",
1498 surrogatepass_errors,
1499 METH_O
1500 }
1501 },
1502 {
1503 "surrogateescape",
1504 {
1505 "surrogateescape",
1506 surrogateescape_errors,
1507 METH_O
1508 }
1509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001510 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001511
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001512 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001513 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001514 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001515
1516 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001517 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001518
1519 interp->codec_search_path = PyList_New(0);
1520 interp->codec_search_cache = PyDict_New();
1521 interp->codec_error_registry = PyDict_New();
1522
1523 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001524 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001525 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001526 int res;
1527 if (!func)
1528 Py_FatalError("can't initialize codec error registry");
1529 res = PyCodec_RegisterError(methods[i].name, func);
1530 Py_DECREF(func);
1531 if (res)
1532 Py_FatalError("can't initialize codec error registry");
1533 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001534 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001535
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001536 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001537 interp->codec_search_cache == NULL ||
1538 interp->codec_error_registry == NULL)
1539 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001540
Christian Heimes819b8bf2008-01-03 23:05:47 +00001541 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001542 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001543 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001544 }
1545 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001546 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001547 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001548}