blob: 27f2aebf827227fef0f5972e0823ab1fcdb8998f [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* Lowercase hexadecimal digit characters; index n (0..15) holds the digit
   character for value n. */
const char *Py_hexdigits =
    "0123456789"
    "abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
18/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000019 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000020
21 This is done in a lazy way so that the Unicode implementation does
22 not downgrade startup time of scripts not needing it.
23
Guido van Rossumb95de4f2000-03-31 17:25:23 +000024 ImportErrors are silently ignored by this function. Only one try is
25 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000026
27*/
28
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000029static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020068 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020070 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Nick Coghlan8fad1672014-09-15 23:50:44 +1200188int _PyCodec_Forget(const char *encoding)
189{
190 PyInterpreterState *interp;
191 PyObject *v;
192 int result;
193
194 interp = PyThreadState_GET()->interp;
195 if (interp->codec_search_path == NULL) {
196 return -1;
197 }
198
199 /* Convert the encoding to a normalized Python string: all
200 characters are converted to lower case, spaces and hyphens are
201 replaced with underscores. */
202 v = normalizestring(encoding);
203 if (v == NULL) {
204 return -1;
205 }
206
207 /* Drop the named codec from the internal cache */
208 result = PyDict_DelItem(interp->codec_search_cache, v);
209 Py_DECREF(v);
210
211 return result;
212}
213
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000214/* Codec registry encoding check API. */
215
216int PyCodec_KnownEncoding(const char *encoding)
217{
218 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000220 codecs = _PyCodec_Lookup(encoding);
221 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000222 PyErr_Clear();
223 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000224 }
225 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000226 Py_DECREF(codecs);
227 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000228 }
229}
230
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000231static
232PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000233 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000234{
235 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000236
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000237 args = PyTuple_New(1 + (errors != NULL));
238 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000240 Py_INCREF(object);
241 PyTuple_SET_ITEM(args,0,object);
242 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000243 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000244
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000245 v = PyUnicode_FromString(errors);
246 if (v == NULL) {
247 Py_DECREF(args);
248 return NULL;
249 }
250 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000251 }
252 return args;
253}
254
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256
257static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000259{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000260 PyObject *codecs;
261 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000262
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000263 codecs = _PyCodec_Lookup(encoding);
264 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000265 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000266 v = PyTuple_GET_ITEM(codecs, index);
267 Py_DECREF(codecs);
268 Py_INCREF(v);
269 return v;
270}
271
Nick Coghlana9b15242014-02-04 22:11:18 +1000272/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000273static
Nick Coghlana9b15242014-02-04 22:11:18 +1000274PyObject *codec_makeincrementalcodec(PyObject *codec_info,
275 const char *errors,
276 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000277{
Nick Coghlana9b15242014-02-04 22:11:18 +1000278 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000279
Nick Coghlana9b15242014-02-04 22:11:18 +1000280 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000281 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000282 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000283 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000285 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000286 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287 Py_DECREF(inccodec);
288 return ret;
289}
290
Nick Coghlana9b15242014-02-04 22:11:18 +1000291static
292PyObject *codec_getincrementalcodec(const char *encoding,
293 const char *errors,
294 const char *attrname)
295{
296 PyObject *codec_info, *ret;
297
298 codec_info = _PyCodec_Lookup(encoding);
299 if (codec_info == NULL)
300 return NULL;
301 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
302 Py_DECREF(codec_info);
303 return ret;
304}
305
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000306/* Helper function to create a stream codec. */
307
308static
309PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000310 PyObject *stream,
311 const char *errors,
312 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000313{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000314 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000315
316 codecs = _PyCodec_Lookup(encoding);
317 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000318 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000319
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000320 codeccls = PyTuple_GET_ITEM(codecs, index);
321 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000322 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000323 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000324 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000325 Py_DECREF(codecs);
326 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000327}
328
Nick Coghlana9b15242014-02-04 22:11:18 +1000329/* Helpers to work with the result of _PyCodec_Lookup
330
331 */
332PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
333 const char *errors)
334{
335 return codec_makeincrementalcodec(codec_info, errors,
336 "incrementaldecoder");
337}
338
339PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
340 const char *errors)
341{
342 return codec_makeincrementalcodec(codec_info, errors,
343 "incrementalencoder");
344}
345
346
Guido van Rossum98297ee2007-11-06 21:34:58 +0000347/* Convenience APIs to query the Codec registry.
348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000350
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351 */
352
353PyObject *PyCodec_Encoder(const char *encoding)
354{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000355 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000356}
357
358PyObject *PyCodec_Decoder(const char *encoding)
359{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361}
362
Thomas Woutersa9773292006-04-21 09:43:23 +0000363PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000364 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000365{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000366 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000367}
368
369PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000370 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000371{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000372 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000373}
374
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000376 PyObject *stream,
377 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000378{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000379 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380}
381
382PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000383 PyObject *stream,
384 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000385{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000386 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000387}
388
/* Helper that tries to make the reported exception chain name the codec
 * operation that triggered the failure, without changing the type of the
 * exception being raised.
 *
 * operation is the verb used in the message (callers in this file pass
 * "encoding" or "decoding"); encoding is the codec name.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a suitably
     * updated clone when it can; otherwise the original exception is left
     * untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
404
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000405/* Encode an object (e.g. an Unicode object) using the given encoding
406 and return the resulting encoded object (usually a Python string).
407
408 errors is passed to the encoder factory as argument if non-NULL. */
409
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000410static PyObject *
411_PyCodec_EncodeInternal(PyObject *object,
412 PyObject *encoder,
413 const char *encoding,
414 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000415{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000416 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000417 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000418
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000419 args = args_tuple(object, errors);
420 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000422
423 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000424 if (result == NULL) {
425 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000426 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000427 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000428
Guido van Rossum98297ee2007-11-06 21:34:58 +0000429 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000430 PyTuple_GET_SIZE(result) != 2) {
431 PyErr_SetString(PyExc_TypeError,
432 "encoder must return a tuple (object, integer)");
433 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000434 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000435 v = PyTuple_GET_ITEM(result,0);
436 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000437 /* We don't check or use the second (integer) entry. */
438
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000439 Py_DECREF(args);
440 Py_DECREF(encoder);
441 Py_DECREF(result);
442 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000443
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000444 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000445 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000446 Py_XDECREF(args);
447 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000448 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000449}
450
451/* Decode an object (usually a Python string) using the given encoding
452 and return an equivalent object (e.g. an Unicode object).
453
454 errors is passed to the decoder factory as argument if non-NULL. */
455
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000456static PyObject *
457_PyCodec_DecodeInternal(PyObject *object,
458 PyObject *decoder,
459 const char *encoding,
460 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000461{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000462 PyObject *args = NULL, *result = NULL;
463 PyObject *v;
464
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000465 args = args_tuple(object, errors);
466 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000468
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000469 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000470 if (result == NULL) {
471 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000472 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000474 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000475 PyTuple_GET_SIZE(result) != 2) {
476 PyErr_SetString(PyExc_TypeError,
477 "decoder must return a tuple (object,integer)");
478 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000479 }
480 v = PyTuple_GET_ITEM(result,0);
481 Py_INCREF(v);
482 /* We don't check or use the second (integer) entry. */
483
484 Py_DECREF(args);
485 Py_DECREF(decoder);
486 Py_DECREF(result);
487 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000488
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000489 onError:
490 Py_XDECREF(args);
491 Py_XDECREF(decoder);
492 Py_XDECREF(result);
493 return NULL;
494}
495
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000496/* Generic encoding/decoding API */
497PyObject *PyCodec_Encode(PyObject *object,
498 const char *encoding,
499 const char *errors)
500{
501 PyObject *encoder;
502
503 encoder = PyCodec_Encoder(encoding);
504 if (encoder == NULL)
505 return NULL;
506
507 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
508}
509
510PyObject *PyCodec_Decode(PyObject *object,
511 const char *encoding,
512 const char *errors)
513{
514 PyObject *decoder;
515
516 decoder = PyCodec_Decoder(encoding);
517 if (decoder == NULL)
518 return NULL;
519
520 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
521}
522
523/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000524PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
525 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000526{
527 _Py_IDENTIFIER(_is_text_encoding);
528 PyObject *codec;
529 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000530 int is_text_codec;
531
532 codec = _PyCodec_Lookup(encoding);
533 if (codec == NULL)
534 return NULL;
535
536 /* Backwards compatibility: assume any raw tuple describes a text
537 * encoding, and the same for anything lacking the private
538 * attribute.
539 */
540 if (!PyTuple_CheckExact(codec)) {
541 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
542 if (attr == NULL) {
543 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
544 PyErr_Clear();
545 } else {
546 Py_DECREF(codec);
547 return NULL;
548 }
549 } else {
550 is_text_codec = PyObject_IsTrue(attr);
551 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300552 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000553 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300554 if (!is_text_codec)
555 PyErr_Format(PyExc_LookupError,
556 "'%.400s' is not a text encoding; "
557 "use %s to handle arbitrary codecs",
558 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000559 return NULL;
560 }
561 }
562 }
563
Nick Coghlana9b15242014-02-04 22:11:18 +1000564 /* This appears to be a valid text encoding */
565 return codec;
566}
567
568
569static
570PyObject *codec_getitem_checked(const char *encoding,
571 const char *alternate_command,
572 int index)
573{
574 PyObject *codec;
575 PyObject *v;
576
577 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578 if (codec == NULL)
579 return NULL;
580
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000581 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000582 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000583 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000584 return v;
585}
586
587static PyObject * _PyCodec_TextEncoder(const char *encoding)
588{
Nick Coghlana9b15242014-02-04 22:11:18 +1000589 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000590}
591
592static PyObject * _PyCodec_TextDecoder(const char *encoding)
593{
Nick Coghlana9b15242014-02-04 22:11:18 +1000594 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000595}
596
597PyObject *_PyCodec_EncodeText(PyObject *object,
598 const char *encoding,
599 const char *errors)
600{
601 PyObject *encoder;
602
603 encoder = _PyCodec_TextEncoder(encoding);
604 if (encoder == NULL)
605 return NULL;
606
607 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
608}
609
610PyObject *_PyCodec_DecodeText(PyObject *object,
611 const char *encoding,
612 const char *errors)
613{
614 PyObject *decoder;
615
616 decoder = _PyCodec_TextDecoder(encoding);
617 if (decoder == NULL)
618 return NULL;
619
620 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
621}
622
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000623/* Register the error handling callback function error under the name
624 name. This function will be called by the codec when it encounters
625 an unencodable characters/undecodable bytes and doesn't know the
626 callback name, when name is specified as the error parameter
627 in the call to the encode/decode function.
628 Return 0 on success, -1 on error */
629int PyCodec_RegisterError(const char *name, PyObject *error)
630{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000631 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000632 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000633 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000634 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000635 PyErr_SetString(PyExc_TypeError, "handler must be callable");
636 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000637 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000638 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300639 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000640}
641
642/* Lookup the error handling callback function registered under the
643 name error. As a special case NULL can be passed, in which case
644 the error handling callback for strict encoding will be returned. */
645PyObject *PyCodec_LookupError(const char *name)
646{
647 PyObject *handler = NULL;
648
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000649 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000650 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000651 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000652
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000653 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000654 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300655 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000658 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000659 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000660 return handler;
661}
662
663static void wrong_exception_type(PyObject *exc)
664{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300665 PyErr_Format(PyExc_TypeError,
666 "don't know how to handle %.200s in error callback",
667 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668}
669
670PyObject *PyCodec_StrictErrors(PyObject *exc)
671{
Brett Cannonbf364092006-03-01 04:25:17 +0000672 if (PyExceptionInstance_Check(exc))
673 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000674 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000675 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676 return NULL;
677}
678
679
680PyObject *PyCodec_IgnoreErrors(PyObject *exc)
681{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000682 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300683
684 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000685 if (PyUnicodeEncodeError_GetEnd(exc, &end))
686 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000687 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300688 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000689 if (PyUnicodeDecodeError_GetEnd(exc, &end))
690 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000691 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300692 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000693 if (PyUnicodeTranslateError_GetEnd(exc, &end))
694 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000695 }
696 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000697 wrong_exception_type(exc);
698 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000699 }
Victor Stinneree450092011-12-01 02:52:11 +0100700 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000701}
702
703
704PyObject *PyCodec_ReplaceErrors(PyObject *exc)
705{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200706 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000707
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300708 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200710 int kind;
711 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000712 if (PyUnicodeEncodeError_GetStart(exc, &start))
713 return NULL;
714 if (PyUnicodeEncodeError_GetEnd(exc, &end))
715 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200716 len = end - start;
717 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000718 if (res == NULL)
719 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720 kind = PyUnicode_KIND(res);
721 data = PyUnicode_DATA(res);
722 for (i = 0; i < len; ++i)
723 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200724 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200725 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000726 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300727 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 if (PyUnicodeDecodeError_GetEnd(exc, &end))
729 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200730 return Py_BuildValue("(Cn)",
731 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
732 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000733 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300734 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200736 int kind;
737 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000738 if (PyUnicodeTranslateError_GetStart(exc, &start))
739 return NULL;
740 if (PyUnicodeTranslateError_GetEnd(exc, &end))
741 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200742 len = end - start;
743 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000744 if (res == NULL)
745 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746 kind = PyUnicode_KIND(res);
747 data = PyUnicode_DATA(res);
748 for (i=0; i < len; i++)
749 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200750 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200751 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000752 }
753 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000754 wrong_exception_type(exc);
755 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000756 }
757}
758
759PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
760{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300761 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 PyObject *restuple;
763 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100764 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000765 Py_ssize_t start;
766 Py_ssize_t end;
767 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100768 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300769 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100770 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000771 if (PyUnicodeEncodeError_GetStart(exc, &start))
772 return NULL;
773 if (PyUnicodeEncodeError_GetEnd(exc, &end))
774 return NULL;
775 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
776 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300777 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
778 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100779 for (i = start, ressize = 0; i < end; ++i) {
780 /* object is guaranteed to be "ready" */
781 ch = PyUnicode_READ_CHAR(object, i);
782 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100784 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100786 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000787 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100788 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100792 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000793 ressize += 2+6+1;
794 else
795 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000796 }
797 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100798 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 if (res == NULL) {
800 Py_DECREF(object);
801 return NULL;
802 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100803 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000804 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100805 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000806 int digits;
807 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100808 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 *outp++ = '&';
810 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100811 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000812 digits = 1;
813 base = 1;
814 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100815 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000816 digits = 2;
817 base = 10;
818 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100819 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000820 digits = 3;
821 base = 100;
822 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100823 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000824 digits = 4;
825 base = 1000;
826 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100827 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000828 digits = 5;
829 base = 10000;
830 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100831 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 digits = 6;
833 base = 100000;
834 }
835 else {
836 digits = 7;
837 base = 1000000;
838 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100840 *outp++ = '0' + ch/base;
841 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 base /= 10;
843 }
844 *outp++ = ';';
845 }
Victor Stinner8f825062012-04-27 13:55:39 +0200846 assert(_PyUnicode_CheckConsistency(res, 1));
847 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 Py_DECREF(object);
849 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000850 }
851 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000852 wrong_exception_type(exc);
853 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000854 }
855}
856
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000857PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
858{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300859 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000860 PyObject *restuple;
861 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100862 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000863 Py_ssize_t start;
864 Py_ssize_t end;
865 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100866 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300867 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100868 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000869 if (PyUnicodeEncodeError_GetStart(exc, &start))
870 return NULL;
871 if (PyUnicodeEncodeError_GetEnd(exc, &end))
872 return NULL;
873 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
874 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300875 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
876 end = start + PY_SSIZE_T_MAX / (1+1+8);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100877 for (i = start, ressize = 0; i < end; ++i) {
878 /* object is guaranteed to be "ready" */
879 c = PyUnicode_READ_CHAR(object, i);
880 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000881 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100882 }
883 else if (c >= 0x100) {
884 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000885 }
886 else
887 ressize += 1+1+2;
888 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100889 res = PyUnicode_New(ressize, 127);
Serhiy Storchaka8aa8c472014-09-23 19:59:09 +0300890 if (res == NULL) {
891 Py_DECREF(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000892 return NULL;
Serhiy Storchaka8aa8c472014-09-23 19:59:09 +0300893 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100894 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
895 i < end; ++i) {
896 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000897 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 if (c >= 0x00010000) {
899 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200900 *outp++ = Py_hexdigits[(c>>28)&0xf];
901 *outp++ = Py_hexdigits[(c>>24)&0xf];
902 *outp++ = Py_hexdigits[(c>>20)&0xf];
903 *outp++ = Py_hexdigits[(c>>16)&0xf];
904 *outp++ = Py_hexdigits[(c>>12)&0xf];
905 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000906 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000907 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000908 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200909 *outp++ = Py_hexdigits[(c>>12)&0xf];
910 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000911 }
912 else
913 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200914 *outp++ = Py_hexdigits[(c>>4)&0xf];
915 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000916 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000917
Victor Stinner8f825062012-04-27 13:55:39 +0200918 assert(_PyUnicode_CheckConsistency(res, 1));
919 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000920 Py_DECREF(object);
921 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000922 }
923 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000924 wrong_exception_type(exc);
925 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000926 }
927}
928
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify an encoding name for the surrogatepass handler.

   Recognized spellings are "utf" (any letter case), an optional '-' or
   '_', then "16" or "32", optionally followed by an optional '-' or '_'
   and "be"/"le" (any letter case).  Without a byte-order suffix the
   native byte order (WORDS_BIGENDIAN) is chosen.

   Every name that is not recognized as a UTF-16/UTF-32 spelling is
   reported as ENC_UTF8.

   *bytelength receives the number of bytes one surrogate code point
   occupies in the returned encoding (3, 2 or 4).

   `encoding` must be a NUL-terminated string. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    /* The && chain stops at the first mismatch, so no byte after the
       terminating NUL is inspected here. */
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* encoding[0] may be the terminator here (e.g. "utf-16-");
               check it before looking at encoding[1] so we never read
               past the end of the string. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16 (e.g. "utf-32_"). */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    /* utf-8 (also the fallback; overrides any length stored above) */
    *bytelength = 3;
    return ENC_UTF8;
}
987
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000988/* This handler is declared static until someone demonstrates
989 a need to call it directly. */
990static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000991PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000992{
993 PyObject *restuple;
994 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200995 PyObject *encode;
996 char *encoding;
997 int code;
998 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100999 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001000 Py_ssize_t start;
1001 Py_ssize_t end;
1002 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001003
1004 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001005 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001006 if (PyUnicodeEncodeError_GetStart(exc, &start))
1007 return NULL;
1008 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1009 return NULL;
1010 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1011 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001012 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1013 Py_DECREF(object);
1014 return NULL;
1015 }
1016 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1017 Py_DECREF(object);
1018 Py_DECREF(encode);
1019 return NULL;
1020 }
1021 code = get_standard_encoding(encoding, &bytelength);
1022 Py_DECREF(encode);
1023
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001024 if (end - start > PY_SSIZE_T_MAX / bytelength)
1025 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001026 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001027 if (!res) {
1028 Py_DECREF(object);
1029 return NULL;
1030 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001031 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001032 for (i = start; i < end; i++) {
1033 /* object is guaranteed to be "ready" */
1034 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001035 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001036 /* Not a surrogate, fail with original exception */
1037 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1038 Py_DECREF(res);
1039 Py_DECREF(object);
1040 return NULL;
1041 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001042 switch (code) {
1043 case ENC_UTF8:
1044 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1045 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1046 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1047 break;
1048 case ENC_UTF16LE:
1049 *outp++ = (unsigned char) ch;
1050 *outp++ = (unsigned char)(ch >> 8);
1051 break;
1052 case ENC_UTF16BE:
1053 *outp++ = (unsigned char)(ch >> 8);
1054 *outp++ = (unsigned char) ch;
1055 break;
1056 case ENC_UTF32LE:
1057 *outp++ = (unsigned char) ch;
1058 *outp++ = (unsigned char)(ch >> 8);
1059 *outp++ = (unsigned char)(ch >> 16);
1060 *outp++ = (unsigned char)(ch >> 24);
1061 break;
1062 case ENC_UTF32BE:
1063 *outp++ = (unsigned char)(ch >> 24);
1064 *outp++ = (unsigned char)(ch >> 16);
1065 *outp++ = (unsigned char)(ch >> 8);
1066 *outp++ = (unsigned char) ch;
1067 break;
1068 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001069 }
1070 restuple = Py_BuildValue("(On)", res, end);
1071 Py_DECREF(res);
1072 Py_DECREF(object);
1073 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001074 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001075 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001076 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001077 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001078 if (PyUnicodeDecodeError_GetStart(exc, &start))
1079 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001080 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1081 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001082 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1083 return NULL;
1084 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1085 Py_DECREF(object);
1086 return NULL;
1087 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001088 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1089 Py_DECREF(object);
1090 return NULL;
1091 }
1092 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1093 Py_DECREF(object);
1094 Py_DECREF(encode);
1095 return NULL;
1096 }
1097 code = get_standard_encoding(encoding, &bytelength);
1098 Py_DECREF(encode);
1099
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001100 /* Try decoding a single surrogate character. If
1101 there are more, let the codec call us again. */
1102 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001103 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1104 switch (code) {
1105 case ENC_UTF8:
1106 if ((p[0] & 0xf0) == 0xe0 &&
1107 (p[1] & 0xc0) == 0x80 &&
1108 (p[2] & 0xc0) == 0x80) {
1109 /* it's a three-byte code */
1110 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1111 }
1112 break;
1113 case ENC_UTF16LE:
1114 ch = p[1] << 8 | p[0];
1115 break;
1116 case ENC_UTF16BE:
1117 ch = p[0] << 8 | p[1];
1118 break;
1119 case ENC_UTF32LE:
1120 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1121 break;
1122 case ENC_UTF32BE:
1123 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1124 break;
1125 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001126 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001127
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001128 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001129 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1130 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1132 return NULL;
1133 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001134 res = PyUnicode_FromOrdinal(ch);
1135 if (res == NULL)
1136 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001137 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001138 }
1139 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001140 wrong_exception_type(exc);
1141 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001142 }
1143}
1144
Martin v. Löwis011e8422009-05-05 04:43:17 +00001145static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001146PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001147{
1148 PyObject *restuple;
1149 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001150 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001151 Py_ssize_t start;
1152 Py_ssize_t end;
1153 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001154
1155 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001156 char *outp;
1157 if (PyUnicodeEncodeError_GetStart(exc, &start))
1158 return NULL;
1159 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1160 return NULL;
1161 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1162 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001163 res = PyBytes_FromStringAndSize(NULL, end-start);
1164 if (!res) {
1165 Py_DECREF(object);
1166 return NULL;
1167 }
1168 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001169 for (i = start; i < end; i++) {
1170 /* object is guaranteed to be "ready" */
1171 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 if (ch < 0xdc80 || ch > 0xdcff) {
1173 /* Not a UTF-8b surrogate, fail with original exception */
1174 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1175 Py_DECREF(res);
1176 Py_DECREF(object);
1177 return NULL;
1178 }
1179 *outp++ = ch - 0xdc00;
1180 }
1181 restuple = Py_BuildValue("(On)", res, end);
1182 Py_DECREF(res);
1183 Py_DECREF(object);
1184 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001185 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001186 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001187 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001188 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001189 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001190 int consumed = 0;
1191 if (PyUnicodeDecodeError_GetStart(exc, &start))
1192 return NULL;
1193 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1194 return NULL;
1195 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1196 return NULL;
1197 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1198 Py_DECREF(object);
1199 return NULL;
1200 }
1201 while (consumed < 4 && consumed < end-start) {
1202 /* Refuse to escape ASCII bytes. */
1203 if (p[start+consumed] < 128)
1204 break;
1205 ch[consumed] = 0xdc00 + p[start+consumed];
1206 consumed++;
1207 }
1208 Py_DECREF(object);
1209 if (!consumed) {
1210 /* codec complained about ASCII byte. */
1211 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1212 return NULL;
1213 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001214 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1215 if (str == NULL)
1216 return NULL;
1217 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001218 }
1219 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001220 wrong_exception_type(exc);
1221 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001222 }
1223}
1224
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001226static PyObject *strict_errors(PyObject *self, PyObject *exc)
1227{
1228 return PyCodec_StrictErrors(exc);
1229}
1230
1231
1232static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1233{
1234 return PyCodec_IgnoreErrors(exc);
1235}
1236
1237
1238static PyObject *replace_errors(PyObject *self, PyObject *exc)
1239{
1240 return PyCodec_ReplaceErrors(exc);
1241}
1242
1243
1244static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1245{
1246 return PyCodec_XMLCharRefReplaceErrors(exc);
1247}
1248
1249
1250static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1251{
1252 return PyCodec_BackslashReplaceErrors(exc);
1253}
1254
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001255static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001256{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001257 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001258}
1259
Martin v. Löwis43c57782009-05-10 08:15:24 +00001260static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001261{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001262 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001263}
1264
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001265static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001266{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001267 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001268 char *name;
1269 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001270 } methods[] =
1271 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001272 {
1273 "strict",
1274 {
1275 "strict_errors",
1276 strict_errors,
1277 METH_O,
1278 PyDoc_STR("Implements the 'strict' error handling, which "
1279 "raises a UnicodeError on coding errors.")
1280 }
1281 },
1282 {
1283 "ignore",
1284 {
1285 "ignore_errors",
1286 ignore_errors,
1287 METH_O,
1288 PyDoc_STR("Implements the 'ignore' error handling, which "
1289 "ignores malformed data and continues.")
1290 }
1291 },
1292 {
1293 "replace",
1294 {
1295 "replace_errors",
1296 replace_errors,
1297 METH_O,
1298 PyDoc_STR("Implements the 'replace' error handling, which "
1299 "replaces malformed data with a replacement marker.")
1300 }
1301 },
1302 {
1303 "xmlcharrefreplace",
1304 {
1305 "xmlcharrefreplace_errors",
1306 xmlcharrefreplace_errors,
1307 METH_O,
1308 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1309 "which replaces an unencodable character with the "
1310 "appropriate XML character reference.")
1311 }
1312 },
1313 {
1314 "backslashreplace",
1315 {
1316 "backslashreplace_errors",
1317 backslashreplace_errors,
1318 METH_O,
1319 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1320 "which replaces an unencodable character with a "
1321 "backslashed escape sequence.")
1322 }
1323 },
1324 {
1325 "surrogatepass",
1326 {
1327 "surrogatepass",
1328 surrogatepass_errors,
1329 METH_O
1330 }
1331 },
1332 {
1333 "surrogateescape",
1334 {
1335 "surrogateescape",
1336 surrogateescape_errors,
1337 METH_O
1338 }
1339 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001340 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001341
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001342 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001343 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001344 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001345
1346 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001347 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001348
1349 interp->codec_search_path = PyList_New(0);
1350 interp->codec_search_cache = PyDict_New();
1351 interp->codec_error_registry = PyDict_New();
1352
1353 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001354 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001355 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 int res;
1357 if (!func)
1358 Py_FatalError("can't initialize codec error registry");
1359 res = PyCodec_RegisterError(methods[i].name, func);
1360 Py_DECREF(func);
1361 if (res)
1362 Py_FatalError("can't initialize codec error registry");
1363 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001364 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001365
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001366 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001367 interp->codec_search_cache == NULL ||
1368 interp->codec_error_registry == NULL)
1369 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001370
Christian Heimes819b8bf2008-01-03 23:05:47 +00001371 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001372 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001373 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001374 }
1375 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001376 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001377 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001378}