blob: 4c2ae381b36d7047e6a4f45cb86232beac6a1de6 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* The sixteen lowercase hexadecimal digit characters, in ascending order. */
const char *Py_hexdigits = "0123456789" "abcdef";
15
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016/* --- Codec Registry ----------------------------------------------------- */
17
/* Import the standard encodings package, which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not
   degrade the startup time of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made.

*/
28
/* Forward declaration: the registry entry points below call this before
   touching the codec search path. */
static int _PyCodecRegistry_Init(void);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000030
Guido van Rossumfeee4b92000-03-10 22:57:27 +000031int PyCodec_Register(PyObject *search_function)
32{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000033 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000034 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000035 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000036 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 PyErr_BadArgument();
38 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000039 }
40 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000041 PyErr_SetString(PyExc_TypeError, "argument must be callable");
42 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000043 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000044 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000045
46 onError:
47 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000048}
49
Guido van Rossum9e896b32000-04-05 20:11:21 +000050/* Convert a string to a normalized Python string: all characters are
51 converted to lower case, spaces are replaced with underscores. */
52
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053static
Guido van Rossum9e896b32000-04-05 20:11:21 +000054PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020056 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000057 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058 char *p;
59 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000060
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000061 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000062 PyErr_SetString(PyExc_OverflowError, "string is too large");
63 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064 }
Guido van Rossum21431e82007-10-19 21:48:41 +000065
66 p = PyMem_Malloc(len + 1);
67 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020068 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000069 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020070 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 if (ch == ' ')
72 ch = '-';
73 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020074 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000075 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000076 }
Guido van Rossum21431e82007-10-19 21:48:41 +000077 p[i] = '\0';
78 v = PyUnicode_FromString(p);
79 if (v == NULL)
80 return NULL;
81 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000102 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000103 PyObject *result, *args = NULL, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000104 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000105
Fred Drake766de832000-05-09 19:55:59 +0000106 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000107 PyErr_BadArgument();
108 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000109 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000110
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000111 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000112 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000113 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000114
Guido van Rossum9e896b32000-04-05 20:11:21 +0000115 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000116 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000117 replaced with underscores. */
118 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000120 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000121 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122
123 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000124 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000125 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000126 Py_INCREF(result);
127 Py_DECREF(v);
128 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000129 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000130
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000132 args = PyTuple_New(1);
133 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000134 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000135 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000137 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000138 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000141 PyErr_SetString(PyExc_LookupError,
142 "no codec search functions registered: "
143 "can't find encoding");
144 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000145 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000146
147 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000149
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000150 func = PyList_GetItem(interp->codec_search_path, i);
151 if (func == NULL)
152 goto onError;
153 result = PyEval_CallObject(func, args);
154 if (result == NULL)
155 goto onError;
156 if (result == Py_None) {
157 Py_DECREF(result);
158 continue;
159 }
160 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
161 PyErr_SetString(PyExc_TypeError,
162 "codec search functions must return 4-tuples");
163 Py_DECREF(result);
164 goto onError;
165 }
166 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000167 }
168 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000169 /* XXX Perhaps we should cache misses too ? */
170 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000171 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000172 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000173 }
174
175 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000176 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000177 Py_DECREF(result);
178 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000179 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000180 Py_DECREF(args);
181 return result;
182
183 onError:
184 Py_XDECREF(args);
185 return NULL;
186}
187
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000188/* Codec registry encoding check API. */
189
190int PyCodec_KnownEncoding(const char *encoding)
191{
192 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000193
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000194 codecs = _PyCodec_Lookup(encoding);
195 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000196 PyErr_Clear();
197 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000198 }
199 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000200 Py_DECREF(codecs);
201 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000202 }
203}
204
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205static
206PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000207 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000208{
209 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000210
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000211 args = PyTuple_New(1 + (errors != NULL));
212 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000213 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000214 Py_INCREF(object);
215 PyTuple_SET_ITEM(args,0,object);
216 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000218
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000219 v = PyUnicode_FromString(errors);
220 if (v == NULL) {
221 Py_DECREF(args);
222 return NULL;
223 }
224 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000225 }
226 return args;
227}
228
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000229/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000230
231static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000232PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000233{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000234 PyObject *codecs;
235 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000236
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000237 codecs = _PyCodec_Lookup(encoding);
238 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000239 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000240 v = PyTuple_GET_ITEM(codecs, index);
241 Py_DECREF(codecs);
242 Py_INCREF(v);
243 return v;
244}
245
Nick Coghlana9b15242014-02-04 22:11:18 +1000246/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000247static
Nick Coghlana9b15242014-02-04 22:11:18 +1000248PyObject *codec_makeincrementalcodec(PyObject *codec_info,
249 const char *errors,
250 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000251{
Nick Coghlana9b15242014-02-04 22:11:18 +1000252 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253
Nick Coghlana9b15242014-02-04 22:11:18 +1000254 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000256 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000257 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000258 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000259 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 ret = PyObject_CallFunction(inccodec, NULL);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 Py_DECREF(inccodec);
262 return ret;
263}
264
Nick Coghlana9b15242014-02-04 22:11:18 +1000265static
266PyObject *codec_getincrementalcodec(const char *encoding,
267 const char *errors,
268 const char *attrname)
269{
270 PyObject *codec_info, *ret;
271
272 codec_info = _PyCodec_Lookup(encoding);
273 if (codec_info == NULL)
274 return NULL;
275 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
276 Py_DECREF(codec_info);
277 return ret;
278}
279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280/* Helper function to create a stream codec. */
281
282static
283PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000284 PyObject *stream,
285 const char *errors,
286 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000287{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000288 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000289
290 codecs = _PyCodec_Lookup(encoding);
291 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000292 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000293
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000294 codeccls = PyTuple_GET_ITEM(codecs, index);
295 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000296 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000297 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000298 streamcodec = PyObject_CallFunction(codeccls, "O", stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000299 Py_DECREF(codecs);
300 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000301}
302
Nick Coghlana9b15242014-02-04 22:11:18 +1000303/* Helpers to work with the result of _PyCodec_Lookup
304
305 */
306PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
307 const char *errors)
308{
309 return codec_makeincrementalcodec(codec_info, errors,
310 "incrementaldecoder");
311}
312
313PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
314 const char *errors)
315{
316 return codec_makeincrementalcodec(codec_info, errors,
317 "incrementalencoder");
318}
319
320
Guido van Rossum98297ee2007-11-06 21:34:58 +0000321/* Convenience APIs to query the Codec registry.
322
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000323 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000324
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000325 */
326
327PyObject *PyCodec_Encoder(const char *encoding)
328{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000329 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000330}
331
332PyObject *PyCodec_Decoder(const char *encoding)
333{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000334 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000335}
336
Thomas Woutersa9773292006-04-21 09:43:23 +0000337PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000338 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000339{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000340 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000341}
342
343PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000344 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000345{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000346 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000347}
348
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000349PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000350 PyObject *stream,
351 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000352{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000353 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000354}
355
356PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000357 PyObject *stream,
358 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000359{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000360 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000361}
362
/* Helper that tries to make the reported exception chain name the codec
 * operation that failed, without changing the type of the exception
 * raised.
 *
 * operation and encoding are interpolated into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a
     * suitably updated clone when it can; otherwise the original
     * exception is left untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
378
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000379/* Encode an object (e.g. an Unicode object) using the given encoding
380 and return the resulting encoded object (usually a Python string).
381
382 errors is passed to the encoder factory as argument if non-NULL. */
383
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000384static PyObject *
385_PyCodec_EncodeInternal(PyObject *object,
386 PyObject *encoder,
387 const char *encoding,
388 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000389{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000390 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000391 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000392
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000393 args = args_tuple(object, errors);
394 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000395 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000396
397 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000398 if (result == NULL) {
399 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000400 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000401 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000402
Guido van Rossum98297ee2007-11-06 21:34:58 +0000403 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000404 PyTuple_GET_SIZE(result) != 2) {
405 PyErr_SetString(PyExc_TypeError,
406 "encoder must return a tuple (object, integer)");
407 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000408 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000409 v = PyTuple_GET_ITEM(result,0);
410 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000411 /* We don't check or use the second (integer) entry. */
412
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000413 Py_DECREF(args);
414 Py_DECREF(encoder);
415 Py_DECREF(result);
416 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000417
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000418 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000419 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000420 Py_XDECREF(args);
421 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000422 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000423}
424
425/* Decode an object (usually a Python string) using the given encoding
426 and return an equivalent object (e.g. an Unicode object).
427
428 errors is passed to the decoder factory as argument if non-NULL. */
429
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000430static PyObject *
431_PyCodec_DecodeInternal(PyObject *object,
432 PyObject *decoder,
433 const char *encoding,
434 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000435{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000436 PyObject *args = NULL, *result = NULL;
437 PyObject *v;
438
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000439 args = args_tuple(object, errors);
440 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000441 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000442
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000443 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000444 if (result == NULL) {
445 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000446 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000447 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000448 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000449 PyTuple_GET_SIZE(result) != 2) {
450 PyErr_SetString(PyExc_TypeError,
451 "decoder must return a tuple (object,integer)");
452 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000453 }
454 v = PyTuple_GET_ITEM(result,0);
455 Py_INCREF(v);
456 /* We don't check or use the second (integer) entry. */
457
458 Py_DECREF(args);
459 Py_DECREF(decoder);
460 Py_DECREF(result);
461 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000462
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000463 onError:
464 Py_XDECREF(args);
465 Py_XDECREF(decoder);
466 Py_XDECREF(result);
467 return NULL;
468}
469
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000470/* Generic encoding/decoding API */
471PyObject *PyCodec_Encode(PyObject *object,
472 const char *encoding,
473 const char *errors)
474{
475 PyObject *encoder;
476
477 encoder = PyCodec_Encoder(encoding);
478 if (encoder == NULL)
479 return NULL;
480
481 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
482}
483
484PyObject *PyCodec_Decode(PyObject *object,
485 const char *encoding,
486 const char *errors)
487{
488 PyObject *decoder;
489
490 decoder = PyCodec_Decoder(encoding);
491 if (decoder == NULL)
492 return NULL;
493
494 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
495}
496
497/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000498PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
499 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000500{
501 _Py_IDENTIFIER(_is_text_encoding);
502 PyObject *codec;
503 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000504 int is_text_codec;
505
506 codec = _PyCodec_Lookup(encoding);
507 if (codec == NULL)
508 return NULL;
509
510 /* Backwards compatibility: assume any raw tuple describes a text
511 * encoding, and the same for anything lacking the private
512 * attribute.
513 */
514 if (!PyTuple_CheckExact(codec)) {
515 attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);
516 if (attr == NULL) {
517 if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
518 PyErr_Clear();
519 } else {
520 Py_DECREF(codec);
521 return NULL;
522 }
523 } else {
524 is_text_codec = PyObject_IsTrue(attr);
525 Py_DECREF(attr);
526 if (!is_text_codec) {
527 Py_DECREF(codec);
528 PyErr_Format(PyExc_LookupError,
529 "'%.400s' is not a text encoding; "
Nick Coghlana9b15242014-02-04 22:11:18 +1000530 "use %s to handle arbitrary codecs",
531 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000532 return NULL;
533 }
534 }
535 }
536
Nick Coghlana9b15242014-02-04 22:11:18 +1000537 /* This appears to be a valid text encoding */
538 return codec;
539}
540
541
542static
543PyObject *codec_getitem_checked(const char *encoding,
544 const char *alternate_command,
545 int index)
546{
547 PyObject *codec;
548 PyObject *v;
549
550 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
551 if (codec == NULL)
552 return NULL;
553
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000554 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000555 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000556 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000557 return v;
558}
559
560static PyObject * _PyCodec_TextEncoder(const char *encoding)
561{
Nick Coghlana9b15242014-02-04 22:11:18 +1000562 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000563}
564
565static PyObject * _PyCodec_TextDecoder(const char *encoding)
566{
Nick Coghlana9b15242014-02-04 22:11:18 +1000567 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000568}
569
570PyObject *_PyCodec_EncodeText(PyObject *object,
571 const char *encoding,
572 const char *errors)
573{
574 PyObject *encoder;
575
576 encoder = _PyCodec_TextEncoder(encoding);
577 if (encoder == NULL)
578 return NULL;
579
580 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
581}
582
583PyObject *_PyCodec_DecodeText(PyObject *object,
584 const char *encoding,
585 const char *errors)
586{
587 PyObject *decoder;
588
589 decoder = _PyCodec_TextDecoder(encoding);
590 if (decoder == NULL)
591 return NULL;
592
593 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
594}
595
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000596/* Register the error handling callback function error under the name
597 name. This function will be called by the codec when it encounters
598 an unencodable characters/undecodable bytes and doesn't know the
599 callback name, when name is specified as the error parameter
600 in the call to the encode/decode function.
601 Return 0 on success, -1 on error */
602int PyCodec_RegisterError(const char *name, PyObject *error)
603{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000604 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000605 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000606 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000607 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000608 PyErr_SetString(PyExc_TypeError, "handler must be callable");
609 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000610 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000611 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300612 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000613}
614
615/* Lookup the error handling callback function registered under the
616 name error. As a special case NULL can be passed, in which case
617 the error handling callback for strict encoding will be returned. */
618PyObject *PyCodec_LookupError(const char *name)
619{
620 PyObject *handler = NULL;
621
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000622 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000623 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000625
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000627 name = "strict";
Serhiy Storchakac6792272013-10-19 21:03:34 +0300628 handler = PyDict_GetItemString(interp->codec_error_registry, name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000629 if (!handler)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000630 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000631 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000632 Py_INCREF(handler);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000633 return handler;
634}
635
636static void wrong_exception_type(PyObject *exc)
637{
Martin v. Löwisbd928fe2011-10-14 10:20:37 +0200638 _Py_IDENTIFIER(__class__);
639 _Py_IDENTIFIER(__name__);
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200640 PyObject *type = _PyObject_GetAttrId(exc, &PyId___class__);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000641 if (type != NULL) {
Martin v. Löwis1ee1b6f2011-10-10 18:11:30 +0200642 PyObject *name = _PyObject_GetAttrId(type, &PyId___name__);
Walter Dörwald573c08c2007-05-25 15:46:59 +0000643 Py_DECREF(type);
644 if (name != NULL) {
645 PyErr_Format(PyExc_TypeError,
646 "don't know how to handle %S in error callback", name);
647 Py_DECREF(name);
648 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000649 }
650}
651
652PyObject *PyCodec_StrictErrors(PyObject *exc)
653{
Brett Cannonbf364092006-03-01 04:25:17 +0000654 if (PyExceptionInstance_Check(exc))
655 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000657 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000658 return NULL;
659}
660
661
662PyObject *PyCodec_IgnoreErrors(PyObject *exc)
663{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000664 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000665 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000666 if (PyUnicodeEncodeError_GetEnd(exc, &end))
667 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000668 }
669 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000670 if (PyUnicodeDecodeError_GetEnd(exc, &end))
671 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000672 }
673 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000674 if (PyUnicodeTranslateError_GetEnd(exc, &end))
675 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676 }
677 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 wrong_exception_type(exc);
679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000680 }
Victor Stinneree450092011-12-01 02:52:11 +0100681 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000682}
683
684
685PyObject *PyCodec_ReplaceErrors(PyObject *exc)
686{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200687 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688
689 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200691 int kind;
692 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000693 if (PyUnicodeEncodeError_GetStart(exc, &start))
694 return NULL;
695 if (PyUnicodeEncodeError_GetEnd(exc, &end))
696 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200697 len = end - start;
698 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000699 if (res == NULL)
700 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200701 kind = PyUnicode_KIND(res);
702 data = PyUnicode_DATA(res);
703 for (i = 0; i < len; ++i)
704 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200705 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200706 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000707 }
708 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000709 if (PyUnicodeDecodeError_GetEnd(exc, &end))
710 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200711 return Py_BuildValue("(Cn)",
712 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
713 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000714 }
715 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000716 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200717 int kind;
718 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000719 if (PyUnicodeTranslateError_GetStart(exc, &start))
720 return NULL;
721 if (PyUnicodeTranslateError_GetEnd(exc, &end))
722 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200723 len = end - start;
724 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000725 if (res == NULL)
726 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200727 kind = PyUnicode_KIND(res);
728 data = PyUnicode_DATA(res);
729 for (i=0; i < len; i++)
730 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200731 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000733 }
734 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000735 wrong_exception_type(exc);
736 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000737 }
738}
739
740PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
741{
742 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000743 PyObject *restuple;
744 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100745 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000746 Py_ssize_t start;
747 Py_ssize_t end;
748 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100749 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000750 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100751 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000752 if (PyUnicodeEncodeError_GetStart(exc, &start))
753 return NULL;
754 if (PyUnicodeEncodeError_GetEnd(exc, &end))
755 return NULL;
756 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
757 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100758 for (i = start, ressize = 0; i < end; ++i) {
759 /* object is guaranteed to be "ready" */
760 ch = PyUnicode_READ_CHAR(object, i);
761 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000762 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100763 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100765 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000766 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100767 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000768 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100769 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000770 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100771 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000772 ressize += 2+6+1;
773 else
774 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000775 }
776 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100777 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 if (res == NULL) {
779 Py_DECREF(object);
780 return NULL;
781 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100782 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000783 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100784 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000785 int digits;
786 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100787 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000788 *outp++ = '&';
789 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100790 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000791 digits = 1;
792 base = 1;
793 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100794 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000795 digits = 2;
796 base = 10;
797 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100798 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 digits = 3;
800 base = 100;
801 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100802 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000803 digits = 4;
804 base = 1000;
805 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100806 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000807 digits = 5;
808 base = 10000;
809 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100810 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000811 digits = 6;
812 base = 100000;
813 }
814 else {
815 digits = 7;
816 base = 1000000;
817 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000818 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100819 *outp++ = '0' + ch/base;
820 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 base /= 10;
822 }
823 *outp++ = ';';
824 }
Victor Stinner8f825062012-04-27 13:55:39 +0200825 assert(_PyUnicode_CheckConsistency(res, 1));
826 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000827 Py_DECREF(object);
828 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000829 }
830 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000831 wrong_exception_type(exc);
832 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000833 }
834}
835
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000836PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
837{
838 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000839 PyObject *restuple;
840 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100841 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000842 Py_ssize_t start;
843 Py_ssize_t end;
844 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100845 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000846 int ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100847 Py_UCS4 c;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000848 if (PyUnicodeEncodeError_GetStart(exc, &start))
849 return NULL;
850 if (PyUnicodeEncodeError_GetEnd(exc, &end))
851 return NULL;
852 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
853 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100854 for (i = start, ressize = 0; i < end; ++i) {
855 /* object is guaranteed to be "ready" */
856 c = PyUnicode_READ_CHAR(object, i);
857 if (c >= 0x10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000858 ressize += 1+1+8;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100859 }
860 else if (c >= 0x100) {
861 ressize += 1+1+4;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000862 }
863 else
864 ressize += 1+1+2;
865 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100866 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000867 if (res==NULL)
868 return NULL;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100869 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
870 i < end; ++i) {
871 c = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000872 *outp++ = '\\';
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000873 if (c >= 0x00010000) {
874 *outp++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200875 *outp++ = Py_hexdigits[(c>>28)&0xf];
876 *outp++ = Py_hexdigits[(c>>24)&0xf];
877 *outp++ = Py_hexdigits[(c>>20)&0xf];
878 *outp++ = Py_hexdigits[(c>>16)&0xf];
879 *outp++ = Py_hexdigits[(c>>12)&0xf];
880 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000881 }
Antoine Pitroue4a18922010-09-09 20:30:23 +0000882 else if (c >= 0x100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000883 *outp++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200884 *outp++ = Py_hexdigits[(c>>12)&0xf];
885 *outp++ = Py_hexdigits[(c>>8)&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000886 }
887 else
888 *outp++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +0200889 *outp++ = Py_hexdigits[(c>>4)&0xf];
890 *outp++ = Py_hexdigits[c&0xf];
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000891 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000892
Victor Stinner8f825062012-04-27 13:55:39 +0200893 assert(_PyUnicode_CheckConsistency(res, 1));
894 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000895 Py_DECREF(object);
896 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000897 }
898 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000899 wrong_exception_type(exc);
900 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000901 }
902}
903
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Map an encoding name to one of the ENC_* codes.

   Recognized spellings (the "utf" prefix and the "be"/"le" suffix are
   matched case-insensitively, '-' and '_' are interchangeable and
   optional):
       utf-8                       -> ENC_UTF8, *bytelength = 3
       utf-16, utf-16-be, -le      -> ENC_UTF16BE/LE, *bytelength = 2
       utf-32, utf-32-be, -le      -> ENC_UTF32BE/LE, *bytelength = 4
       "CP_UTF8" (exact match)     -> ENC_UTF8, *bytelength = 3
   Without an explicit byte order, utf-16/utf-32 select the big-endian
   code if WORDS_BIGENDIAN is defined and the little-endian code otherwise.

   encoding must be a NUL-terminated string.  Returns ENC_UNKNOWN for any
   other name; in that case *bytelength may or may not have been written
   and must not be relied upon. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Check encoding[0] first: if the name ends right after the
               separator (e.g. "utf-16-"), encoding[1] would lie beyond the
               terminating NUL. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF16BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF16LE;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            /* Same guard as for UTF-16 above. */
            if (encoding[0] != '\0' &&
                Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
                if (Py_TOLOWER(encoding[0]) == 'b')
                    return ENC_UTF32BE;
                if (Py_TOLOWER(encoding[0]) == 'l')
                    return ENC_UTF32LE;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
969
Martin v. Löwisaef3fb02009-05-02 19:27:30 +0000970/* This handler is declared static until someone demonstrates
971 a need to call it directly. */
972static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +0000973PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000974{
975 PyObject *restuple;
976 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200977 PyObject *encode;
978 char *encoding;
979 int code;
980 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100981 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000982 Py_ssize_t start;
983 Py_ssize_t end;
984 PyObject *res;
985 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200986 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000987 if (PyUnicodeEncodeError_GetStart(exc, &start))
988 return NULL;
989 if (PyUnicodeEncodeError_GetEnd(exc, &end))
990 return NULL;
991 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
992 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +0200993 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
994 Py_DECREF(object);
995 return NULL;
996 }
997 if (!(encoding = PyUnicode_AsUTF8(encode))) {
998 Py_DECREF(object);
999 Py_DECREF(encode);
1000 return NULL;
1001 }
1002 code = get_standard_encoding(encoding, &bytelength);
1003 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001004 if (code == ENC_UNKNOWN) {
1005 /* Not supported, fail with original exception */
1006 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1007 Py_DECREF(object);
1008 return NULL;
1009 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001010
1011 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001012 if (!res) {
1013 Py_DECREF(object);
1014 return NULL;
1015 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001016 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001017 for (i = start; i < end; i++) {
1018 /* object is guaranteed to be "ready" */
1019 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001020 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001021 /* Not a surrogate, fail with original exception */
1022 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1023 Py_DECREF(res);
1024 Py_DECREF(object);
1025 return NULL;
1026 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001027 switch (code) {
1028 case ENC_UTF8:
1029 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1030 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1031 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1032 break;
1033 case ENC_UTF16LE:
1034 *outp++ = (unsigned char) ch;
1035 *outp++ = (unsigned char)(ch >> 8);
1036 break;
1037 case ENC_UTF16BE:
1038 *outp++ = (unsigned char)(ch >> 8);
1039 *outp++ = (unsigned char) ch;
1040 break;
1041 case ENC_UTF32LE:
1042 *outp++ = (unsigned char) ch;
1043 *outp++ = (unsigned char)(ch >> 8);
1044 *outp++ = (unsigned char)(ch >> 16);
1045 *outp++ = (unsigned char)(ch >> 24);
1046 break;
1047 case ENC_UTF32BE:
1048 *outp++ = (unsigned char)(ch >> 24);
1049 *outp++ = (unsigned char)(ch >> 16);
1050 *outp++ = (unsigned char)(ch >> 8);
1051 *outp++ = (unsigned char) ch;
1052 break;
1053 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001054 }
1055 restuple = Py_BuildValue("(On)", res, end);
1056 Py_DECREF(res);
1057 Py_DECREF(object);
1058 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001059 }
1060 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001061 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001062 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001063 if (PyUnicodeDecodeError_GetStart(exc, &start))
1064 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001065 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1066 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001067 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1068 return NULL;
1069 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1070 Py_DECREF(object);
1071 return NULL;
1072 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001073 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1074 Py_DECREF(object);
1075 return NULL;
1076 }
1077 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1078 Py_DECREF(object);
1079 Py_DECREF(encode);
1080 return NULL;
1081 }
1082 code = get_standard_encoding(encoding, &bytelength);
1083 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001084 if (code == ENC_UNKNOWN) {
1085 /* Not supported, fail with original exception */
1086 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1087 Py_DECREF(object);
1088 return NULL;
1089 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001090
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001091 /* Try decoding a single surrogate character. If
1092 there are more, let the codec call us again. */
1093 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001094 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1095 switch (code) {
1096 case ENC_UTF8:
1097 if ((p[0] & 0xf0) == 0xe0 &&
1098 (p[1] & 0xc0) == 0x80 &&
1099 (p[2] & 0xc0) == 0x80) {
1100 /* it's a three-byte code */
1101 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1102 }
1103 break;
1104 case ENC_UTF16LE:
1105 ch = p[1] << 8 | p[0];
1106 break;
1107 case ENC_UTF16BE:
1108 ch = p[0] << 8 | p[1];
1109 break;
1110 case ENC_UTF32LE:
1111 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1112 break;
1113 case ENC_UTF32BE:
1114 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1115 break;
1116 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001117 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001118
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001119 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001120 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1121 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001122 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1123 return NULL;
1124 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001125 res = PyUnicode_FromOrdinal(ch);
1126 if (res == NULL)
1127 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001128 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001129 }
1130 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001131 wrong_exception_type(exc);
1132 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001133 }
1134}
1135
Martin v. Löwis011e8422009-05-05 04:43:17 +00001136static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001137PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001138{
1139 PyObject *restuple;
1140 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001141 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001142 Py_ssize_t start;
1143 Py_ssize_t end;
1144 PyObject *res;
1145 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001146 char *outp;
1147 if (PyUnicodeEncodeError_GetStart(exc, &start))
1148 return NULL;
1149 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1150 return NULL;
1151 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1152 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001153 res = PyBytes_FromStringAndSize(NULL, end-start);
1154 if (!res) {
1155 Py_DECREF(object);
1156 return NULL;
1157 }
1158 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001159 for (i = start; i < end; i++) {
1160 /* object is guaranteed to be "ready" */
1161 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001162 if (ch < 0xdc80 || ch > 0xdcff) {
1163 /* Not a UTF-8b surrogate, fail with original exception */
1164 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165 Py_DECREF(res);
1166 Py_DECREF(object);
1167 return NULL;
1168 }
1169 *outp++ = ch - 0xdc00;
1170 }
1171 restuple = Py_BuildValue("(On)", res, end);
1172 Py_DECREF(res);
1173 Py_DECREF(object);
1174 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001175 }
1176 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001177 PyObject *str;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001178 unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001179 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001180 int consumed = 0;
1181 if (PyUnicodeDecodeError_GetStart(exc, &start))
1182 return NULL;
1183 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1184 return NULL;
1185 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1186 return NULL;
1187 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
1188 Py_DECREF(object);
1189 return NULL;
1190 }
1191 while (consumed < 4 && consumed < end-start) {
1192 /* Refuse to escape ASCII bytes. */
1193 if (p[start+consumed] < 128)
1194 break;
1195 ch[consumed] = 0xdc00 + p[start+consumed];
1196 consumed++;
1197 }
1198 Py_DECREF(object);
1199 if (!consumed) {
1200 /* codec complained about ASCII byte. */
1201 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1202 return NULL;
1203 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001204 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1205 if (str == NULL)
1206 return NULL;
1207 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001208 }
1209 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001210 wrong_exception_type(exc);
1211 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001212 }
1213}
1214
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001215
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001216static PyObject *strict_errors(PyObject *self, PyObject *exc)
1217{
1218 return PyCodec_StrictErrors(exc);
1219}
1220
1221
1222static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1223{
1224 return PyCodec_IgnoreErrors(exc);
1225}
1226
1227
1228static PyObject *replace_errors(PyObject *self, PyObject *exc)
1229{
1230 return PyCodec_ReplaceErrors(exc);
1231}
1232
1233
1234static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1235{
1236 return PyCodec_XMLCharRefReplaceErrors(exc);
1237}
1238
1239
1240static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1241{
1242 return PyCodec_BackslashReplaceErrors(exc);
1243}
1244
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001245static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001246{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001247 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001248}
1249
Martin v. Löwis43c57782009-05-10 08:15:24 +00001250static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001251{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001252 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001253}
1254
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001255static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001256{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001257 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001258 char *name;
1259 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 } methods[] =
1261 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001262 {
1263 "strict",
1264 {
1265 "strict_errors",
1266 strict_errors,
1267 METH_O,
1268 PyDoc_STR("Implements the 'strict' error handling, which "
1269 "raises a UnicodeError on coding errors.")
1270 }
1271 },
1272 {
1273 "ignore",
1274 {
1275 "ignore_errors",
1276 ignore_errors,
1277 METH_O,
1278 PyDoc_STR("Implements the 'ignore' error handling, which "
1279 "ignores malformed data and continues.")
1280 }
1281 },
1282 {
1283 "replace",
1284 {
1285 "replace_errors",
1286 replace_errors,
1287 METH_O,
1288 PyDoc_STR("Implements the 'replace' error handling, which "
1289 "replaces malformed data with a replacement marker.")
1290 }
1291 },
1292 {
1293 "xmlcharrefreplace",
1294 {
1295 "xmlcharrefreplace_errors",
1296 xmlcharrefreplace_errors,
1297 METH_O,
1298 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1299 "which replaces an unencodable character with the "
1300 "appropriate XML character reference.")
1301 }
1302 },
1303 {
1304 "backslashreplace",
1305 {
1306 "backslashreplace_errors",
1307 backslashreplace_errors,
1308 METH_O,
1309 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1310 "which replaces an unencodable character with a "
1311 "backslashed escape sequence.")
1312 }
1313 },
1314 {
1315 "surrogatepass",
1316 {
1317 "surrogatepass",
1318 surrogatepass_errors,
1319 METH_O
1320 }
1321 },
1322 {
1323 "surrogateescape",
1324 {
1325 "surrogateescape",
1326 surrogateescape_errors,
1327 METH_O
1328 }
1329 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001330 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331
Nicholas Bastine5662ae2004-03-24 22:22:12 +00001332 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001333 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001334 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001335
1336 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001338
1339 interp->codec_search_path = PyList_New(0);
1340 interp->codec_search_cache = PyDict_New();
1341 interp->codec_error_registry = PyDict_New();
1342
1343 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001344 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001345 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001346 int res;
1347 if (!func)
1348 Py_FatalError("can't initialize codec error registry");
1349 res = PyCodec_RegisterError(methods[i].name, func);
1350 Py_DECREF(func);
1351 if (res)
1352 Py_FatalError("can't initialize codec error registry");
1353 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001354 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001355
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001356 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001357 interp->codec_search_cache == NULL ||
1358 interp->codec_error_registry == NULL)
1359 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001360
Christian Heimes819b8bf2008-01-03 23:05:47 +00001361 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001362 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001363 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001364 }
1365 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001366 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001367 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001368}