blob: 151fea7d498a186b54d9b6c422883e0469052ff8 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
Victor Stinnerf5cff562011-10-14 02:13:11 +020014const char *Py_hexdigits = "0123456789abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
/* Import the standard encodings package, which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not slow
   down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020068 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020070 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Nick Coghlan8fad1672014-09-15 23:50:44 +1200188int _PyCodec_Forget(const char *encoding)
189{
190 PyInterpreterState *interp;
191 PyObject *v;
192 int result;
193
194 interp = PyThreadState_GET()->interp;
195 if (interp->codec_search_path == NULL) {
196 return -1;
197 }
198
199 /* Convert the encoding to a normalized Python string: all
200 characters are converted to lower case, spaces and hyphens are
201 replaced with underscores. */
202 v = normalizestring(encoding);
203 if (v == NULL) {
204 return -1;
205 }
206
207 /* Drop the named codec from the internal cache */
208 result = PyDict_DelItem(interp->codec_search_cache, v);
209 Py_DECREF(v);
210
211 return result;
212}
213
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000214/* Codec registry encoding check API. */
215
216int PyCodec_KnownEncoding(const char *encoding)
217{
218 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000220 codecs = _PyCodec_Lookup(encoding);
221 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 PyErr_Clear();
223 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000224 }
225 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 Py_DECREF(codecs);
227 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000228 }
229}
230
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000231static
232PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000234{
235 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000236
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000237 args = PyTuple_New(1 + (errors != NULL));
238 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000240 Py_INCREF(object);
241 PyTuple_SET_ITEM(args,0,object);
242 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 v = PyUnicode_FromString(errors);
246 if (v == NULL) {
247 Py_DECREF(args);
248 return NULL;
249 }
250 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000251 }
252 return args;
253}
254
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256
257static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000259{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 PyObject *codecs;
261 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000262
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000263 codecs = _PyCodec_Lookup(encoding);
264 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 v = PyTuple_GET_ITEM(codecs, index);
267 Py_DECREF(codecs);
268 Py_INCREF(v);
269 return v;
270}
271
Nick Coghlana9b15242014-02-04 22:11:18 +1000272/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000273static
Nick Coghlana9b15242014-02-04 22:11:18 +1000274PyObject *codec_makeincrementalcodec(PyObject *codec_info,
275 const char *errors,
276 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Nick Coghlana9b15242014-02-04 22:11:18 +1000278 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
Nick Coghlana9b15242014-02-04 22:11:18 +1000280 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 Py_DECREF(inccodec);
288 return ret;
289}
290
Nick Coghlana9b15242014-02-04 22:11:18 +1000291static
292PyObject *codec_getincrementalcodec(const char *encoding,
293 const char *errors,
294 const char *attrname)
295{
296 PyObject *codec_info, *ret;
297
298 codec_info = _PyCodec_Lookup(encoding);
299 if (codec_info == NULL)
300 return NULL;
301 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
302 Py_DECREF(codec_info);
303 return ret;
304}
305
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306/* Helper function to create a stream codec. */
307
308static
309PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 PyObject *stream,
311 const char *errors,
312 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000313{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000314 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315
316 codecs = _PyCodec_Lookup(encoding);
317 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000319
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000320 codeccls = PyTuple_GET_ITEM(codecs, index);
321 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000323 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 Py_DECREF(codecs);
326 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000327}
328
Nick Coghlana9b15242014-02-04 22:11:18 +1000329/* Helpers to work with the result of _PyCodec_Lookup
330
331 */
332PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
333 const char *errors)
334{
335 return codec_makeincrementalcodec(codec_info, errors,
336 "incrementaldecoder");
337}
338
339PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
340 const char *errors)
341{
342 return codec_makeincrementalcodec(codec_info, errors,
343 "incrementalencoder");
344}
345
346
Guido van Rossum98297ee2007-11-06 21:34:58 +0000347/* Convenience APIs to query the Codec registry.
348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351 */
352
353PyObject *PyCodec_Encoder(const char *encoding)
354{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000355 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000356}
357
358PyObject *PyCodec_Decoder(const char *encoding)
359{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361}
362
Thomas Woutersa9773292006-04-21 09:43:23 +0000363PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000365{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000367}
368
369PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000370 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000371{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000373}
374
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000376 PyObject *stream,
377 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380}
381
382PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 PyObject *stream,
384 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000385{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000387}
388
/* Try to make the pending exception name the operation and codec that
 * failed, without changing the type of the exception raised.
 *
 * operation and encoding are substituted into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause is expected to replace the active
     * exception with a suitably updated clone when it can and to leave
     * the original alone otherwise (its definition is not in this
     * file).
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
404
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000405/* Encode an object (e.g. an Unicode object) using the given encoding
406 and return the resulting encoded object (usually a Python string).
407
408 errors is passed to the encoder factory as argument if non-NULL. */
409
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000410static PyObject *
411_PyCodec_EncodeInternal(PyObject *object,
412 PyObject *encoder,
413 const char *encoding,
414 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000416 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000417 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000418
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419 args = args_tuple(object, errors);
420 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000422
423 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000424 if (result == NULL) {
425 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000427 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000428
Guido van Rossum98297ee2007-11-06 21:34:58 +0000429 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000430 PyTuple_GET_SIZE(result) != 2) {
431 PyErr_SetString(PyExc_TypeError,
432 "encoder must return a tuple (object, integer)");
433 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000434 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000435 v = PyTuple_GET_ITEM(result,0);
436 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000437 /* We don't check or use the second (integer) entry. */
438
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000439 Py_DECREF(args);
440 Py_DECREF(encoder);
441 Py_DECREF(result);
442 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000443
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000444 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000445 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000446 Py_XDECREF(args);
447 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000448 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000449}
450
451/* Decode an object (usually a Python string) using the given encoding
452 and return an equivalent object (e.g. an Unicode object).
453
454 errors is passed to the decoder factory as argument if non-NULL. */
455
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000456static PyObject *
457_PyCodec_DecodeInternal(PyObject *object,
458 PyObject *decoder,
459 const char *encoding,
460 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000461{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000462 PyObject *args = NULL, *result = NULL;
463 PyObject *v;
464
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000465 args = args_tuple(object, errors);
466 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000468
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000469 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000470 if (result == NULL) {
471 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000474 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 PyTuple_GET_SIZE(result) != 2) {
476 PyErr_SetString(PyExc_TypeError,
477 "decoder must return a tuple (object,integer)");
478 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000479 }
480 v = PyTuple_GET_ITEM(result,0);
481 Py_INCREF(v);
482 /* We don't check or use the second (integer) entry. */
483
484 Py_DECREF(args);
485 Py_DECREF(decoder);
486 Py_DECREF(result);
487 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000488
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000489 onError:
490 Py_XDECREF(args);
491 Py_XDECREF(decoder);
492 Py_XDECREF(result);
493 return NULL;
494}
495
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000496/* Generic encoding/decoding API */
497PyObject *PyCodec_Encode(PyObject *object,
498 const char *encoding,
499 const char *errors)
500{
501 PyObject *encoder;
502
503 encoder = PyCodec_Encoder(encoding);
504 if (encoder == NULL)
505 return NULL;
506
507 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
508}
509
510PyObject *PyCodec_Decode(PyObject *object,
511 const char *encoding,
512 const char *errors)
513{
514 PyObject *decoder;
515
516 decoder = PyCodec_Decoder(encoding);
517 if (decoder == NULL)
518 return NULL;
519
520 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
521}
522
523/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000524PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
525 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000526{
527 _Py_IDENTIFIER(_is_text_encoding);
528 PyObject *codec;
529 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000530 int is_text_codec;
531
532 codec = _PyCodec_Lookup(encoding);
533 if (codec == NULL)
534 return NULL;
535
536 /* Backwards compatibility: assume any raw tuple describes a text
537 * encoding, and the same for anything lacking the private
538 * attribute.
539 */
540 if (!PyTuple_CheckExact(codec)) {
541 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
542 if (attr == NULL) {
543 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
544 PyErr_Clear();
545 } else {
546 Py_DECREF(codec);
547 return NULL;
548 }
549 } else {
550 is_text_codec = PyObject_IsTrue(attr);
551 Py_DECREF(attr);
552 if (!is_text_codec) {
553 Py_DECREF(codec);
554 PyErr_Format(PyExc_LookupError,
555 "'%.400s' is not a text encoding; "
Nick Coghlana9b15242014-02-04 22:11:18 +1000556 "use %s to handle arbitrary codecs",
557 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000558 return NULL;
559 }
560 }
561 }
562
Nick Coghlana9b15242014-02-04 22:11:18 +1000563 /* This appears to be a valid text encoding */
564 return codec;
565}
566
567
568static
569PyObject *codec_getitem_checked(const char *encoding,
570 const char *alternate_command,
571 int index)
572{
573 PyObject *codec;
574 PyObject *v;
575
576 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
577 if (codec == NULL)
578 return NULL;
579
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000580 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000581 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000582 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000583 return v;
584}
585
586static PyObject * _PyCodec_TextEncoder(const char *encoding)
587{
Nick Coghlana9b15242014-02-04 22:11:18 +1000588 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000589}
590
591static PyObject * _PyCodec_TextDecoder(const char *encoding)
592{
Nick Coghlana9b15242014-02-04 22:11:18 +1000593 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000594}
595
596PyObject *_PyCodec_EncodeText(PyObject *object,
597 const char *encoding,
598 const char *errors)
599{
600 PyObject *encoder;
601
602 encoder = _PyCodec_TextEncoder(encoding);
603 if (encoder == NULL)
604 return NULL;
605
606 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
607}
608
609PyObject *_PyCodec_DecodeText(PyObject *object,
610 const char *encoding,
611 const char *errors)
612{
613 PyObject *decoder;
614
615 decoder = _PyCodec_TextDecoder(encoding);
616 if (decoder == NULL)
617 return NULL;
618
619 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
620}
621
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000622/* Register the error handling callback function error under the name
623 name. This function will be called by the codec when it encounters
624 an unencodable characters/undecodable bytes and doesn't know the
625 callback name, when name is specified as the error parameter
626 in the call to the encode/decode function.
627 Return 0 on success, -1 on error */
628int PyCodec_RegisterError(const char *name, PyObject *error)
629{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000630 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000631 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000632 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000633 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000634 PyErr_SetString(PyExc_TypeError, "handler must be callable");
635 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000636 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000637 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300638 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000639}
640
641/* Lookup the error handling callback function registered under the
642 name error. As a special case NULL can be passed, in which case
643 the error handling callback for strict encoding will be returned. */
644PyObject *PyCodec_LookupError(const char *name)
645{
646 PyObject *handler = NULL;
647
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000648 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000649 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000650 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000651
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000652 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000653 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300654 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000655 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000656 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000657 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000658 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000659 return handler;
660}
661
662static void wrong_exception_type(PyObject *exc)
663{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200664 _Py_IDENTIFIER(__class__);
665 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200666 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000667 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200668 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000669 Py_DECREF(type);
670 if (name != NULL) {
671 PyErr_Format(PyExc_TypeError,
672 "don't know how to handle %S in error callback", name);
673 Py_DECREF(name);
674 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000675 }
676}
677
678PyObject *PyCodec_StrictErrors(PyObject *exc)
679{
Brett Cannonbf364092006-03-01 04:25:17 +0000680 if (PyExceptionInstance_Check(exc))
681 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000682 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000683 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000684 return NULL;
685}
686
687
688PyObject *PyCodec_IgnoreErrors(PyObject *exc)
689{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000690 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000691 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000692 if (PyUnicodeEncodeError_GetEnd(exc, &end))
693 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000694 }
695 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000696 if (PyUnicodeDecodeError_GetEnd(exc, &end))
697 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000698 }
699 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000700 if (PyUnicodeTranslateError_GetEnd(exc, &end))
701 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000702 }
703 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000704 wrong_exception_type(exc);
705 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000706 }
Victor Stinneree450092011-12-01 02:52:11 +0100707 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000708}
709
710
711PyObject *PyCodec_ReplaceErrors(PyObject *exc)
712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000714
715 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000716 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 int kind;
718 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (PyUnicodeEncodeError_GetStart(exc, &start))
720 return NULL;
721 if (PyUnicodeEncodeError_GetEnd(exc, &end))
722 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 len = end - start;
724 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000725 if (res == NULL)
726 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727 kind = PyUnicode_KIND(res);
728 data = PyUnicode_DATA(res);
729 for (i = 0; i < len; ++i)
730 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200731 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000733 }
734 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 if (PyUnicodeDecodeError_GetEnd(exc, &end))
736 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200737 return Py_BuildValue("(Cn)",
738 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
739 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000740 }
741 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000742 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200743 int kind;
744 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000745 if (PyUnicodeTranslateError_GetStart(exc, &start))
746 return NULL;
747 if (PyUnicodeTranslateError_GetEnd(exc, &end))
748 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200749 len = end - start;
750 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000751 if (res == NULL)
752 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200753 kind = PyUnicode_KIND(res);
754 data = PyUnicode_DATA(res);
755 for (i=0; i < len; i++)
756 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200757 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000759 }
760 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000761 wrong_exception_type(exc);
762 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000763 }
764}
765
766PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
767{
768 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000769 PyObject *restuple;
770 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100771 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 Py_ssize_t start;
773 Py_ssize_t end;
774 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100775 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300776 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100777 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 if (PyUnicodeEncodeError_GetStart(exc, &start))
779 return NULL;
780 if (PyUnicodeEncodeError_GetEnd(exc, &end))
781 return NULL;
782 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
783 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300784 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
785 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 for (i = start, ressize = 0; i < end; ++i) {
787 /* object is guaranteed to be "ready" */
788 ch = PyUnicode_READ_CHAR(object, i);
789 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000790 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100791 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100793 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000794 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100795 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100797 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000798 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100799 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000800 ressize += 2+6+1;
801 else
802 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 }
804 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100805 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 if (res == NULL) {
807 Py_DECREF(object);
808 return NULL;
809 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100810 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100812 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 int digits;
814 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100815 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 *outp++ = '&';
817 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100818 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000819 digits = 1;
820 base = 1;
821 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100822 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000823 digits = 2;
824 base = 10;
825 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100826 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 digits = 3;
828 base = 100;
829 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100830 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 digits = 4;
832 base = 1000;
833 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100834 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 digits = 5;
836 base = 10000;
837 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100838 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 digits = 6;
840 base = 100000;
841 }
842 else {
843 digits = 7;
844 base = 1000000;
845 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100847 *outp++ = '0' + ch/base;
848 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000849 base /= 10;
850 }
851 *outp++ = ';';
852 }
Victor Stinner8f825062012-04-27 13:55:39 +0200853 assert(_PyUnicode_CheckConsistency(res, 1));
854 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000855 Py_DECREF(object);
856 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857 }
858 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000859 wrong_exception_type(exc);
860 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000861 }
862}
863
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000864PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
865{
866 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867 PyObject *restuple;
868 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100869 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000870 Py_ssize_t start;
871 Py_ssize_t end;
872 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100873 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300874 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100875 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000876 if (PyUnicodeEncodeError_GetStart(exc, &start))
877 return NULL;
878 if (PyUnicodeEncodeError_GetEnd(exc, &end))
879 return NULL;
880 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
881 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300882 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
883 end = start + PY_SSIZE_T_MAX / (1+1+8);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100884 for (i = start, ressize = 0; i < end; ++i) {
885 /* object is guaranteed to be "ready" */
886 c = PyUnicode_READ_CHAR(object, i);
887 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000888 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100889 }
890 else if (c >= 0x100) {
891 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000892 }
893 else
894 ressize += 1+1+2;
895 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100896 res = PyUnicode_New(ressize, 127);
Serhiy Storchaka8aa8c472014-09-23 19:59:09 +0300897 if (res == NULL) {
898 Py_DECREF(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 return NULL;
Serhiy Storchaka8aa8c472014-09-23 19:59:09 +0300900 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100901 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
902 i < end; ++i) {
903 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000904 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000905 if (c >= 0x00010000) {
906 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200907 *outp++ = Py_hexdigits[(c>>28)&0xf];
908 *outp++ = Py_hexdigits[(c>>24)&0xf];
909 *outp++ = Py_hexdigits[(c>>20)&0xf];
910 *outp++ = Py_hexdigits[(c>>16)&0xf];
911 *outp++ = Py_hexdigits[(c>>12)&0xf];
912 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000913 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000914 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000915 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200916 *outp++ = Py_hexdigits[(c>>12)&0xf];
917 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000918 }
919 else
920 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200921 *outp++ = Py_hexdigits[(c>>4)&0xf];
922 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000923 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000924
Victor Stinner8f825062012-04-27 13:55:39 +0200925 assert(_PyUnicode_CheckConsistency(res, 1));
926 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000927 Py_DECREF(object);
928 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 }
930 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000931 wrong_exception_type(exc);
932 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000933 }
934}
935
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name as one of the ENC_* constants.

   Recognized spellings (the "utf" prefix and the byte-order suffix are
   matched case-insensitively, '-' or '_' separators are optional):
       utf8, utf16, utf16le, utf16be, utf32, utf32le, utf32be
   plus the exact string "CP_UTF8".  A UTF-16/UTF-32 name without a
   byte-order suffix maps to the native byte order (WORDS_BIGENDIAN).

   encoding   - NUL-terminated name to classify.
   bytelength - receives the number of bytes used per surrogate code
                point (3, 2 or 4).  It is only meaningful when the
                return value is not ENC_UNKNOWN.

   Returns the ENC_* code, or ENC_UNKNOWN for anything else. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        int be_code, le_code, order;

        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            be_code = ENC_UTF16BE;
            le_code = ENC_UTF16LE;
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            be_code = ENC_UTF32BE;
            le_code = ENC_UTF32LE;
        }
        else {
            return ENC_UNKNOWN;
        }
        encoding += 2;
        if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
            return be_code;
#else
            return le_code;
#endif
        }
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        /* Look at encoding[0] first: for a name such as "utf-16-" it is
           the terminating NUL, and encoding[1] would lie beyond the end
           of the string. */
        order = Py_TOLOWER(encoding[0]);
        if ((order == 'b' || order == 'l') &&
            Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
            return order == 'b' ? be_code : le_code;
        }
        return ENC_UNKNOWN;
    }
    if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1001
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001002/* This handler is declared static until someone demonstrates
1003 a need to call it directly. */
1004static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001005PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001006{
1007 PyObject *restuple;
1008 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001009 PyObject *encode;
1010 char *encoding;
1011 int code;
1012 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001013 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001014 Py_ssize_t start;
1015 Py_ssize_t end;
1016 PyObject *res;
1017 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001018 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001019 if (PyUnicodeEncodeError_GetStart(exc, &start))
1020 return NULL;
1021 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1022 return NULL;
1023 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1024 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001025 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1026 Py_DECREF(object);
1027 return NULL;
1028 }
1029 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1030 Py_DECREF(object);
1031 Py_DECREF(encode);
1032 return NULL;
1033 }
1034 code = get_standard_encoding(encoding, &bytelength);
1035 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001036 if (code == ENC_UNKNOWN) {
1037 /* Not supported, fail with original exception */
1038 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1039 Py_DECREF(object);
1040 return NULL;
1041 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001042
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001043 if (end - start > PY_SSIZE_T_MAX / bytelength)
1044 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001045 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001046 if (!res) {
1047 Py_DECREF(object);
1048 return NULL;
1049 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001050 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001051 for (i = start; i < end; i++) {
1052 /* object is guaranteed to be "ready" */
1053 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001054 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001055 /* Not a surrogate, fail with original exception */
1056 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1057 Py_DECREF(res);
1058 Py_DECREF(object);
1059 return NULL;
1060 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001061 switch (code) {
1062 case ENC_UTF8:
1063 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1064 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1065 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1066 break;
1067 case ENC_UTF16LE:
1068 *outp++ = (unsigned char) ch;
1069 *outp++ = (unsigned char)(ch >> 8);
1070 break;
1071 case ENC_UTF16BE:
1072 *outp++ = (unsigned char)(ch >> 8);
1073 *outp++ = (unsigned char) ch;
1074 break;
1075 case ENC_UTF32LE:
1076 *outp++ = (unsigned char) ch;
1077 *outp++ = (unsigned char)(ch >> 8);
1078 *outp++ = (unsigned char)(ch >> 16);
1079 *outp++ = (unsigned char)(ch >> 24);
1080 break;
1081 case ENC_UTF32BE:
1082 *outp++ = (unsigned char)(ch >> 24);
1083 *outp++ = (unsigned char)(ch >> 16);
1084 *outp++ = (unsigned char)(ch >> 8);
1085 *outp++ = (unsigned char) ch;
1086 break;
1087 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001088 }
1089 restuple = Py_BuildValue("(On)", res, end);
1090 Py_DECREF(res);
1091 Py_DECREF(object);
1092 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001093 }
1094 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001095 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001096 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001097 if (PyUnicodeDecodeError_GetStart(exc, &start))
1098 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001099 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1100 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001101 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1102 return NULL;
1103 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1104 Py_DECREF(object);
1105 return NULL;
1106 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001107 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1108 Py_DECREF(object);
1109 return NULL;
1110 }
1111 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1112 Py_DECREF(object);
1113 Py_DECREF(encode);
1114 return NULL;
1115 }
1116 code = get_standard_encoding(encoding, &bytelength);
1117 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001118 if (code == ENC_UNKNOWN) {
1119 /* Not supported, fail with original exception */
1120 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1121 Py_DECREF(object);
1122 return NULL;
1123 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001124
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001125 /* Try decoding a single surrogate character. If
1126 there are more, let the codec call us again. */
1127 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001128 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1129 switch (code) {
1130 case ENC_UTF8:
1131 if ((p[0] & 0xf0) == 0xe0 &&
1132 (p[1] & 0xc0) == 0x80 &&
1133 (p[2] & 0xc0) == 0x80) {
1134 /* it's a three-byte code */
1135 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1136 }
1137 break;
1138 case ENC_UTF16LE:
1139 ch = p[1] << 8 | p[0];
1140 break;
1141 case ENC_UTF16BE:
1142 ch = p[0] << 8 | p[1];
1143 break;
1144 case ENC_UTF32LE:
1145 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1146 break;
1147 case ENC_UTF32BE:
1148 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1149 break;
1150 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001151 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001152
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001154 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1155 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001156 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1157 return NULL;
1158 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001159 res = PyUnicode_FromOrdinal(ch);
1160 if (res == NULL)
1161 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001162 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001163 }
1164 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001165 wrong_exception_type(exc);
1166 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001167 }
1168}
1169
Martin v. Löwis011e8422009-05-05 04:43:17 +00001170static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001171PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001172{
1173 PyObject *restuple;
1174 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001175 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001176 Py_ssize_t start;
1177 Py_ssize_t end;
1178 PyObject *res;
1179 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 char *outp;
1181 if (PyUnicodeEncodeError_GetStart(exc, &start))
1182 return NULL;
1183 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1184 return NULL;
1185 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1186 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001187 res = PyBytes_FromStringAndSize(NULL, end-start);
1188 if (!res) {
1189 Py_DECREF(object);
1190 return NULL;
1191 }
1192 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001193 for (i = start; i < end; i++) {
1194 /* object is guaranteed to be "ready" */
1195 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001196 if (ch < 0xdc80 || ch > 0xdcff) {
1197 /* Not a UTF-8b surrogate, fail with original exception */
1198 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1199 Py_DECREF(res);
1200 Py_DECREF(object);
1201 return NULL;
1202 }
1203 *outp++ = ch - 0xdc00;
1204 }
1205 restuple = Py_BuildValue("(On)", res, end);
1206 Py_DECREF(res);
1207 Py_DECREF(object);
1208 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001209 }
1210 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001211 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001212 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001213 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 int consumed = 0;
1215 if (PyUnicodeDecodeError_GetStart(exc, &start))
1216 return NULL;
1217 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1218 return NULL;
1219 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1220 return NULL;
1221 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1222 Py_DECREF(object);
1223 return NULL;
1224 }
1225 while (consumed < 4 && consumed < end-start) {
1226 /* Refuse to escape ASCII bytes. */
1227 if (p[start+consumed] < 128)
1228 break;
1229 ch[consumed] = 0xdc00 + p[start+consumed];
1230 consumed++;
1231 }
1232 Py_DECREF(object);
1233 if (!consumed) {
1234 /* codec complained about ASCII byte. */
1235 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1236 return NULL;
1237 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001238 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1239 if (str == NULL)
1240 return NULL;
1241 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001242 }
1243 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001244 wrong_exception_type(exc);
1245 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001246 }
1247}
1248
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001249
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001250static PyObject *strict_errors(PyObject *self, PyObject *exc)
1251{
1252 return PyCodec_StrictErrors(exc);
1253}
1254
1255
1256static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1257{
1258 return PyCodec_IgnoreErrors(exc);
1259}
1260
1261
1262static PyObject *replace_errors(PyObject *self, PyObject *exc)
1263{
1264 return PyCodec_ReplaceErrors(exc);
1265}
1266
1267
1268static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1269{
1270 return PyCodec_XMLCharRefReplaceErrors(exc);
1271}
1272
1273
1274static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1275{
1276 return PyCodec_BackslashReplaceErrors(exc);
1277}
1278
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001279static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001280{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001281 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001282}
1283
Martin v. Löwis43c57782009-05-10 08:15:24 +00001284static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001285{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001286 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001287}
1288
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001289static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001290{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001291 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001292 char *name;
1293 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001294 } methods[] =
1295 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001296 {
1297 "strict",
1298 {
1299 "strict_errors",
1300 strict_errors,
1301 METH_O,
1302 PyDoc_STR("Implements the 'strict' error handling, which "
1303 "raises a UnicodeError on coding errors.")
1304 }
1305 },
1306 {
1307 "ignore",
1308 {
1309 "ignore_errors",
1310 ignore_errors,
1311 METH_O,
1312 PyDoc_STR("Implements the 'ignore' error handling, which "
1313 "ignores malformed data and continues.")
1314 }
1315 },
1316 {
1317 "replace",
1318 {
1319 "replace_errors",
1320 replace_errors,
1321 METH_O,
1322 PyDoc_STR("Implements the 'replace' error handling, which "
1323 "replaces malformed data with a replacement marker.")
1324 }
1325 },
1326 {
1327 "xmlcharrefreplace",
1328 {
1329 "xmlcharrefreplace_errors",
1330 xmlcharrefreplace_errors,
1331 METH_O,
1332 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1333 "which replaces an unencodable character with the "
1334 "appropriate XML character reference.")
1335 }
1336 },
1337 {
1338 "backslashreplace",
1339 {
1340 "backslashreplace_errors",
1341 backslashreplace_errors,
1342 METH_O,
1343 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1344 "which replaces an unencodable character with a "
1345 "backslashed escape sequence.")
1346 }
1347 },
1348 {
1349 "surrogatepass",
1350 {
1351 "surrogatepass",
1352 surrogatepass_errors,
1353 METH_O
1354 }
1355 },
1356 {
1357 "surrogateescape",
1358 {
1359 "surrogateescape",
1360 surrogateescape_errors,
1361 METH_O
1362 }
1363 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001364 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001365
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001366 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001367 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001368 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001369
1370 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001371 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001372
1373 interp->codec_search_path = PyList_New(0);
1374 interp->codec_search_cache = PyDict_New();
1375 interp->codec_error_registry = PyDict_New();
1376
1377 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001378 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001379 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001380 int res;
1381 if (!func)
1382 Py_FatalError("can't initialize codec error registry");
1383 res = PyCodec_RegisterError(methods[i].name, func);
1384 Py_DECREF(func);
1385 if (res)
1386 Py_FatalError("can't initialize codec error registry");
1387 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001388 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001389
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001390 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001391 interp->codec_search_cache == NULL ||
1392 interp->codec_error_registry == NULL)
1393 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001394
Christian Heimes819b8bf2008-01-03 23:05:47 +00001395 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001396 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001397 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001398 }
1399 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001400 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001401 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001402}