blob: 386576256f065534bd3ca89129837f6c5d7caa54 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010012#include "pycore_pystate.h"
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020013#include "ucnhash.h"
Guido van Rossumfeee4b92000-03-10 22:57:27 +000014#include <ctype.h>
15
/* The sixteen lowercase hexadecimal digit characters, in ascending order. */
Victor Stinnerf5cff562011-10-14 02:13:11 +020016const char *Py_hexdigits = "0123456789abcdef";
17
Guido van Rossumfeee4b92000-03-10 22:57:27 +000018/* --- Codec Registry ----------------------------------------------------- */
19
20/* Import the standard encodings package which will register the first
Guido van Rossum98297ee2007-11-06 21:34:58 +000021 codec search function.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000022
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
Guido van Rossumb95de4f2000-03-31 17:25:23 +000026 ImportErrors are silently ignored by this function. Only one try is
27 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
29*/
30
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000031static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
Guido van Rossumfeee4b92000-03-10 22:57:27 +000033int PyCodec_Register(PyObject *search_function)
34{
Victor Stinnercaba55b2018-08-03 15:33:52 +020035 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000036 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000037 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000038 if (search_function == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000039 PyErr_BadArgument();
40 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
42 if (!PyCallable_Check(search_function)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000043 PyErr_SetString(PyExc_TypeError, "argument must be callable");
44 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000045 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000046 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000047
48 onError:
49 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000050}
51
Guido van Rossum9e896b32000-04-05 20:11:21 +000052/* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
Guido van Rossumfeee4b92000-03-10 22:57:27 +000055static
Guido van Rossum9e896b32000-04-05 20:11:21 +000056PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000057{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020058 size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000059 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000060 char *p;
61 PyObject *v;
Guido van Rossum21431e82007-10-19 21:48:41 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063 if (len > PY_SSIZE_T_MAX) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000064 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000066 }
Guido van Rossum21431e82007-10-19 21:48:41 +000067
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
Victor Stinnercc351592013-07-12 00:02:55 +020070 return PyErr_NoMemory();
Guido van Rossum9e896b32000-04-05 20:11:21 +000071 for (i = 0; i < len; i++) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020072 char ch = string[i];
Guido van Rossum9e896b32000-04-05 20:11:21 +000073 if (ch == ' ')
74 ch = '-';
75 else
Antoine Pitroucf9d3c02011-07-24 02:27:04 +020076 ch = Py_TOLOWER(Py_CHARMASK(ch));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +000077 p[i] = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +000078 }
Guido van Rossum21431e82007-10-19 21:48:41 +000079 p[i] = '\0';
80 v = PyUnicode_FromString(p);
Guido van Rossum21431e82007-10-19 21:48:41 +000081 PyMem_Free(p);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082 return v;
83}
84
85/* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
Guido van Rossum98297ee2007-11-06 21:34:58 +000092 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000093
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099
100PyObject *_PyCodec_Lookup(const char *encoding)
101{
Jeroen Demeyer196a5302019-07-04 12:31:34 +0200102 PyObject *result, *v;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000103 Py_ssize_t i, len;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000104
Fred Drake766de832000-05-09 19:55:59 +0000105 if (encoding == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000106 PyErr_BadArgument();
107 goto onError;
Fred Drake766de832000-05-09 19:55:59 +0000108 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000109
Victor Stinnercaba55b2018-08-03 15:33:52 +0200110 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000111 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000112 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113
Guido van Rossum9e896b32000-04-05 20:11:21 +0000114 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000115 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000116 replaced with underscores. */
117 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000118 if (v == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000119 goto onError;
Guido van Rossum21431e82007-10-19 21:48:41 +0000120 PyUnicode_InternInPlace(&v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000121
122 /* First, try to lookup the name in the registry dictionary */
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200123 result = PyDict_GetItemWithError(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000124 if (result != NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000125 Py_INCREF(result);
126 Py_DECREF(v);
127 return result;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000128 }
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200129 else if (PyErr_Occurred()) {
130 Py_DECREF(v);
131 return NULL;
132 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000133
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000134 /* Next, scan the search functions in order of registration */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000135 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000136 if (len < 0)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000137 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000138 if (len == 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000139 PyErr_SetString(PyExc_LookupError,
140 "no codec search functions registered: "
141 "can't find encoding");
142 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000143 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144
145 for (i = 0; i < len; i++) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000146 PyObject *func;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000147
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000148 func = PyList_GetItem(interp->codec_search_path, i);
149 if (func == NULL)
150 goto onError;
Jeroen Demeyer196a5302019-07-04 12:31:34 +0200151 result = _PyObject_CallOneArg(func, v);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000152 if (result == NULL)
153 goto onError;
154 if (result == Py_None) {
155 Py_DECREF(result);
156 continue;
157 }
158 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
159 PyErr_SetString(PyExc_TypeError,
160 "codec search functions must return 4-tuples");
161 Py_DECREF(result);
162 goto onError;
163 }
164 break;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000165 }
166 if (i == len) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000167 /* XXX Perhaps we should cache misses too ? */
168 PyErr_Format(PyExc_LookupError,
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000169 "unknown encoding: %s", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000170 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 }
172
173 /* Cache and return the result */
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000174 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000175 Py_DECREF(result);
176 goto onError;
Neal Norwitz9edcc2e2007-08-11 04:58:26 +0000177 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000178 return result;
179
180 onError:
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000181 return NULL;
182}
183
Nick Coghlan8fad1672014-09-15 23:50:44 +1200184int _PyCodec_Forget(const char *encoding)
185{
Nick Coghlan8fad1672014-09-15 23:50:44 +1200186 PyObject *v;
187 int result;
188
Victor Stinnercaba55b2018-08-03 15:33:52 +0200189 PyInterpreterState *interp = _PyInterpreterState_Get();
Nick Coghlan8fad1672014-09-15 23:50:44 +1200190 if (interp->codec_search_path == NULL) {
191 return -1;
192 }
193
194 /* Convert the encoding to a normalized Python string: all
195 characters are converted to lower case, spaces and hyphens are
196 replaced with underscores. */
197 v = normalizestring(encoding);
198 if (v == NULL) {
199 return -1;
200 }
201
202 /* Drop the named codec from the internal cache */
203 result = PyDict_DelItem(interp->codec_search_cache, v);
204 Py_DECREF(v);
205
206 return result;
207}
208
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000209/* Codec registry encoding check API. */
210
211int PyCodec_KnownEncoding(const char *encoding)
212{
213 PyObject *codecs;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000214
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000215 codecs = _PyCodec_Lookup(encoding);
216 if (!codecs) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000217 PyErr_Clear();
218 return 0;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000219 }
220 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000221 Py_DECREF(codecs);
222 return 1;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000223 }
224}
225
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000226static
227PyObject *args_tuple(PyObject *object,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000228 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000229{
230 PyObject *args;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000231
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000232 args = PyTuple_New(1 + (errors != NULL));
233 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000234 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000235 Py_INCREF(object);
236 PyTuple_SET_ITEM(args,0,object);
237 if (errors) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000238 PyObject *v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000239
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000240 v = PyUnicode_FromString(errors);
241 if (v == NULL) {
242 Py_DECREF(args);
243 return NULL;
244 }
245 PyTuple_SET_ITEM(args, 1, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000246 }
247 return args;
248}
249
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000250/* Helper function to get a codec item */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000251
252static
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000253PyObject *codec_getitem(const char *encoding, int index)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000254{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000255 PyObject *codecs;
256 PyObject *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000257
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000258 codecs = _PyCodec_Lookup(encoding);
259 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000260 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000261 v = PyTuple_GET_ITEM(codecs, index);
262 Py_DECREF(codecs);
263 Py_INCREF(v);
264 return v;
265}
266
Nick Coghlana9b15242014-02-04 22:11:18 +1000267/* Helper functions to create an incremental codec. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000268static
Nick Coghlana9b15242014-02-04 22:11:18 +1000269PyObject *codec_makeincrementalcodec(PyObject *codec_info,
270 const char *errors,
271 const char *attrname)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000272{
Nick Coghlana9b15242014-02-04 22:11:18 +1000273 PyObject *ret, *inccodec;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000274
Nick Coghlana9b15242014-02-04 22:11:18 +1000275 inccodec = PyObject_GetAttrString(codec_info, attrname);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000276 if (inccodec == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000277 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000278 if (errors)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000279 ret = PyObject_CallFunction(inccodec, "s", errors);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000280 else
Victor Stinner4778eab2016-12-01 14:51:04 +0100281 ret = _PyObject_CallNoArg(inccodec);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000282 Py_DECREF(inccodec);
283 return ret;
284}
285
Nick Coghlana9b15242014-02-04 22:11:18 +1000286static
287PyObject *codec_getincrementalcodec(const char *encoding,
288 const char *errors,
289 const char *attrname)
290{
291 PyObject *codec_info, *ret;
292
293 codec_info = _PyCodec_Lookup(encoding);
294 if (codec_info == NULL)
295 return NULL;
296 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
297 Py_DECREF(codec_info);
298 return ret;
299}
300
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000301/* Helper function to create a stream codec. */
302
303static
304PyObject *codec_getstreamcodec(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000305 PyObject *stream,
306 const char *errors,
307 const int index)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000308{
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000309 PyObject *codecs, *streamcodec, *codeccls;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000310
311 codecs = _PyCodec_Lookup(encoding);
312 if (codecs == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000313 return NULL;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000314
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000315 codeccls = PyTuple_GET_ITEM(codecs, index);
316 if (errors != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000317 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +0000318 else
Jeroen Demeyer196a5302019-07-04 12:31:34 +0200319 streamcodec = _PyObject_CallOneArg(codeccls, stream);
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000320 Py_DECREF(codecs);
321 return streamcodec;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000322}
323
Nick Coghlana9b15242014-02-04 22:11:18 +1000324/* Helpers to work with the result of _PyCodec_Lookup
325
326 */
327PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
328 const char *errors)
329{
330 return codec_makeincrementalcodec(codec_info, errors,
331 "incrementaldecoder");
332}
333
334PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
335 const char *errors)
336{
337 return codec_makeincrementalcodec(codec_info, errors,
338 "incrementalencoder");
339}
340
341
Guido van Rossum98297ee2007-11-06 21:34:58 +0000342/* Convenience APIs to query the Codec registry.
343
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000344 All APIs return a codec object with incremented refcount.
Guido van Rossum98297ee2007-11-06 21:34:58 +0000345
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000346 */
347
348PyObject *PyCodec_Encoder(const char *encoding)
349{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000350 return codec_getitem(encoding, 0);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000351}
352
353PyObject *PyCodec_Decoder(const char *encoding)
354{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000355 return codec_getitem(encoding, 1);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000356}
357
Thomas Woutersa9773292006-04-21 09:43:23 +0000358PyObject *PyCodec_IncrementalEncoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000359 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000360{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000361 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000362}
363
364PyObject *PyCodec_IncrementalDecoder(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000365 const char *errors)
Thomas Woutersa9773292006-04-21 09:43:23 +0000366{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000367 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
Thomas Woutersa9773292006-04-21 09:43:23 +0000368}
369
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000370PyObject *PyCodec_StreamReader(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000371 PyObject *stream,
372 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000373{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000374 return codec_getstreamcodec(encoding, stream, errors, 2);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000375}
376
377PyObject *PyCodec_StreamWriter(const char *encoding,
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000378 PyObject *stream,
379 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000380{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000381 return codec_getstreamcodec(encoding, stream, errors, 3);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000382}
383
/* Helper that tries to make the reported exception chain name the codec
 * and operation that triggered a failure, without changing the type of
 * the exception raised.
 *
 * operation and encoding are interpolated into the message
 * "<operation> with '<encoding>' codec failed".
 */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* _PyErr_TrySetFromCause replaces the active exception with a suitably
     * updated clone when it can; otherwise the original exception is left
     * alone.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
399
Martin Panter6245cb32016-04-15 02:14:19 +0000400/* Encode an object (e.g. a Unicode object) using the given encoding
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000401 and return the resulting encoded object (usually a Python string).
402
403 errors is passed to the encoder factory as argument if non-NULL. */
404
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000405static PyObject *
406_PyCodec_EncodeInternal(PyObject *object,
407 PyObject *encoder,
408 const char *encoding,
409 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000410{
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000411 PyObject *args = NULL, *result = NULL;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000412 PyObject *v = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000413
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000414 args = args_tuple(object, errors);
415 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000416 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000417
418 result = PyEval_CallObject(encoder, args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000419 if (result == NULL) {
420 wrap_codec_error("encoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000421 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000422 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000423
Guido van Rossum98297ee2007-11-06 21:34:58 +0000424 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000425 PyTuple_GET_SIZE(result) != 2) {
426 PyErr_SetString(PyExc_TypeError,
427 "encoder must return a tuple (object, integer)");
428 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000429 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000430 v = PyTuple_GET_ITEM(result,0);
431 Py_INCREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000432 /* We don't check or use the second (integer) entry. */
433
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000434 Py_DECREF(args);
435 Py_DECREF(encoder);
436 Py_DECREF(result);
437 return v;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000438
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000439 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000440 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000441 Py_XDECREF(args);
442 Py_XDECREF(encoder);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +0000443 return NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000444}
445
446/* Decode an object (usually a Python string) using the given encoding
Martin Panter6245cb32016-04-15 02:14:19 +0000447 and return an equivalent object (e.g. a Unicode object).
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000448
449 errors is passed to the decoder factory as argument if non-NULL. */
450
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000451static PyObject *
452_PyCodec_DecodeInternal(PyObject *object,
453 PyObject *decoder,
454 const char *encoding,
455 const char *errors)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000456{
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000457 PyObject *args = NULL, *result = NULL;
458 PyObject *v;
459
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000460 args = args_tuple(object, errors);
461 if (args == NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000462 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000463
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000464 result = PyEval_CallObject(decoder,args);
Nick Coghlanc4c25802013-11-15 21:47:37 +1000465 if (result == NULL) {
466 wrap_codec_error("decoding", encoding);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000467 goto onError;
Nick Coghlanc4c25802013-11-15 21:47:37 +1000468 }
Guido van Rossum98297ee2007-11-06 21:34:58 +0000469 if (!PyTuple_Check(result) ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000470 PyTuple_GET_SIZE(result) != 2) {
471 PyErr_SetString(PyExc_TypeError,
472 "decoder must return a tuple (object,integer)");
473 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000474 }
475 v = PyTuple_GET_ITEM(result,0);
476 Py_INCREF(v);
477 /* We don't check or use the second (integer) entry. */
478
479 Py_DECREF(args);
480 Py_DECREF(decoder);
481 Py_DECREF(result);
482 return v;
Guido van Rossum98297ee2007-11-06 21:34:58 +0000483
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000484 onError:
485 Py_XDECREF(args);
486 Py_XDECREF(decoder);
487 Py_XDECREF(result);
488 return NULL;
489}
490
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000491/* Generic encoding/decoding API */
492PyObject *PyCodec_Encode(PyObject *object,
493 const char *encoding,
494 const char *errors)
495{
496 PyObject *encoder;
497
498 encoder = PyCodec_Encoder(encoding);
499 if (encoder == NULL)
500 return NULL;
501
502 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
503}
504
505PyObject *PyCodec_Decode(PyObject *object,
506 const char *encoding,
507 const char *errors)
508{
509 PyObject *decoder;
510
511 decoder = PyCodec_Decoder(encoding);
512 if (decoder == NULL)
513 return NULL;
514
515 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
516}
517
518/* Text encoding/decoding API */
Nick Coghlana9b15242014-02-04 22:11:18 +1000519PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
520 const char *alternate_command)
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000521{
522 _Py_IDENTIFIER(_is_text_encoding);
523 PyObject *codec;
524 PyObject *attr;
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000525 int is_text_codec;
526
527 codec = _PyCodec_Lookup(encoding);
528 if (codec == NULL)
529 return NULL;
530
531 /* Backwards compatibility: assume any raw tuple describes a text
532 * encoding, and the same for anything lacking the private
533 * attribute.
534 */
535 if (!PyTuple_CheckExact(codec)) {
Serhiy Storchakaf320be72018-01-25 10:49:40 +0200536 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
537 Py_DECREF(codec);
538 return NULL;
539 }
540 if (attr != NULL) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000541 is_text_codec = PyObject_IsTrue(attr);
542 Py_DECREF(attr);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300543 if (is_text_codec <= 0) {
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000544 Py_DECREF(codec);
Serhiy Storchakafa494fd2015-05-30 17:45:22 +0300545 if (!is_text_codec)
546 PyErr_Format(PyExc_LookupError,
547 "'%.400s' is not a text encoding; "
548 "use %s to handle arbitrary codecs",
549 encoding, alternate_command);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000550 return NULL;
551 }
552 }
553 }
554
Nick Coghlana9b15242014-02-04 22:11:18 +1000555 /* This appears to be a valid text encoding */
556 return codec;
557}
558
559
560static
561PyObject *codec_getitem_checked(const char *encoding,
562 const char *alternate_command,
563 int index)
564{
565 PyObject *codec;
566 PyObject *v;
567
568 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
569 if (codec == NULL)
570 return NULL;
571
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000572 v = PyTuple_GET_ITEM(codec, index);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000573 Py_INCREF(v);
Nick Coghlana9b15242014-02-04 22:11:18 +1000574 Py_DECREF(codec);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000575 return v;
576}
577
578static PyObject * _PyCodec_TextEncoder(const char *encoding)
579{
Nick Coghlana9b15242014-02-04 22:11:18 +1000580 return codec_getitem_checked(encoding, "codecs.encode()", 0);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000581}
582
583static PyObject * _PyCodec_TextDecoder(const char *encoding)
584{
Nick Coghlana9b15242014-02-04 22:11:18 +1000585 return codec_getitem_checked(encoding, "codecs.decode()", 1);
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000586}
587
588PyObject *_PyCodec_EncodeText(PyObject *object,
589 const char *encoding,
590 const char *errors)
591{
592 PyObject *encoder;
593
594 encoder = _PyCodec_TextEncoder(encoding);
595 if (encoder == NULL)
596 return NULL;
597
598 return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
599}
600
601PyObject *_PyCodec_DecodeText(PyObject *object,
602 const char *encoding,
603 const char *errors)
604{
605 PyObject *decoder;
606
607 decoder = _PyCodec_TextDecoder(encoding);
608 if (decoder == NULL)
609 return NULL;
610
611 return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
612}
613
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000614/* Register the error handling callback function error under the name
615 name. This function will be called by the codec when it encounters
616 an unencodable characters/undecodable bytes and doesn't know the
617 callback name, when name is specified as the error parameter
618 in the call to the encode/decode function.
619 Return 0 on success, -1 on error */
620int PyCodec_RegisterError(const char *name, PyObject *error)
621{
Victor Stinnercaba55b2018-08-03 15:33:52 +0200622 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000623 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000624 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000625 if (!PyCallable_Check(error)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000626 PyErr_SetString(PyExc_TypeError, "handler must be callable");
627 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000628 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000629 return PyDict_SetItemString(interp->codec_error_registry,
Serhiy Storchakac6792272013-10-19 21:03:34 +0300630 name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000631}
632
633/* Lookup the error handling callback function registered under the
634 name error. As a special case NULL can be passed, in which case
635 the error handling callback for strict encoding will be returned. */
636PyObject *PyCodec_LookupError(const char *name)
637{
638 PyObject *handler = NULL;
639
Victor Stinnercaba55b2018-08-03 15:33:52 +0200640 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000641 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000642 return NULL;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000643
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000644 if (name==NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000645 name = "strict";
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200646 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
647 if (handler) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000648 Py_INCREF(handler);
Serhiy Storchakaa24107b2019-02-25 17:59:46 +0200649 }
650 else if (!PyErr_Occurred()) {
651 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000653 return handler;
654}
655
656static void wrong_exception_type(PyObject *exc)
657{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300658 PyErr_Format(PyExc_TypeError,
659 "don't know how to handle %.200s in error callback",
660 exc->ob_type->tp_name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000661}
662
663PyObject *PyCodec_StrictErrors(PyObject *exc)
664{
Brett Cannonbf364092006-03-01 04:25:17 +0000665 if (PyExceptionInstance_Check(exc))
666 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000667 else
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000668 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000669 return NULL;
670}
671
672
673PyObject *PyCodec_IgnoreErrors(PyObject *exc)
674{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000675 Py_ssize_t end;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300676
677 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000678 if (PyUnicodeEncodeError_GetEnd(exc, &end))
679 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000680 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300681 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000682 if (PyUnicodeDecodeError_GetEnd(exc, &end))
683 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000684 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300685 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000686 if (PyUnicodeTranslateError_GetEnd(exc, &end))
687 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688 }
689 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000690 wrong_exception_type(exc);
691 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000692 }
Victor Stinneree450092011-12-01 02:52:11 +0100693 return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000694}
695
696
697PyObject *PyCodec_ReplaceErrors(PyObject *exc)
698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200699 Py_ssize_t start, end, i, len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000700
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300701 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000702 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200703 int kind;
704 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000705 if (PyUnicodeEncodeError_GetStart(exc, &start))
706 return NULL;
707 if (PyUnicodeEncodeError_GetEnd(exc, &end))
708 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200709 len = end - start;
710 res = PyUnicode_New(len, '?');
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000711 if (res == NULL)
712 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713 kind = PyUnicode_KIND(res);
714 data = PyUnicode_DATA(res);
715 for (i = 0; i < len; ++i)
716 PyUnicode_WRITE(kind, data, i, '?');
Victor Stinner8f825062012-04-27 13:55:39 +0200717 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200718 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000719 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300720 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000721 if (PyUnicodeDecodeError_GetEnd(exc, &end))
722 return NULL;
Victor Stinner1a15aba2011-10-02 19:00:15 +0200723 return Py_BuildValue("(Cn)",
724 (int)Py_UNICODE_REPLACEMENT_CHARACTER,
725 end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000726 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300727 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000728 PyObject *res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200729 int kind;
730 void *data;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000731 if (PyUnicodeTranslateError_GetStart(exc, &start))
732 return NULL;
733 if (PyUnicodeTranslateError_GetEnd(exc, &end))
734 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200735 len = end - start;
736 res = PyUnicode_New(len, Py_UNICODE_REPLACEMENT_CHARACTER);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000737 if (res == NULL)
738 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200739 kind = PyUnicode_KIND(res);
740 data = PyUnicode_DATA(res);
741 for (i=0; i < len; i++)
742 PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER);
Victor Stinner8f825062012-04-27 13:55:39 +0200743 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000745 }
746 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000747 wrong_exception_type(exc);
748 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749 }
750}
751
752PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
753{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300754 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000755 PyObject *restuple;
756 PyObject *object;
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100757 Py_ssize_t i;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000758 Py_ssize_t start;
759 Py_ssize_t end;
760 PyObject *res;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100761 unsigned char *outp;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300762 Py_ssize_t ressize;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100763 Py_UCS4 ch;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000764 if (PyUnicodeEncodeError_GetStart(exc, &start))
765 return NULL;
766 if (PyUnicodeEncodeError_GetEnd(exc, &end))
767 return NULL;
768 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
769 return NULL;
Serhiy Storchaka2e374092014-10-04 14:15:49 +0300770 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
771 end = start + PY_SSIZE_T_MAX / (2+7+1);
Martin v. Löwisb09af032011-11-04 11:16:41 +0100772 for (i = start, ressize = 0; i < end; ++i) {
773 /* object is guaranteed to be "ready" */
774 ch = PyUnicode_READ_CHAR(object, i);
775 if (ch<10)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000776 ressize += 2+1+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100777 else if (ch<100)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000778 ressize += 2+2+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100779 else if (ch<1000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000780 ressize += 2+3+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100781 else if (ch<10000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000782 ressize += 2+4+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100783 else if (ch<100000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000784 ressize += 2+5+1;
Martin v. Löwisb09af032011-11-04 11:16:41 +0100785 else if (ch<1000000)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000786 ressize += 2+6+1;
787 else
788 ressize += 2+7+1;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000789 }
790 /* allocate replacement */
Martin v. Löwisb09af032011-11-04 11:16:41 +0100791 res = PyUnicode_New(ressize, 127);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000792 if (res == NULL) {
793 Py_DECREF(object);
794 return NULL;
795 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100796 outp = PyUnicode_1BYTE_DATA(res);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000797 /* generate replacement */
Victor Stinnerb31f1bc2011-11-04 21:29:10 +0100798 for (i = start; i < end; ++i) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000799 int digits;
800 int base;
Martin v. Löwis8ba79302011-11-04 12:26:49 +0100801 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000802 *outp++ = '&';
803 *outp++ = '#';
Martin v. Löwisb09af032011-11-04 11:16:41 +0100804 if (ch<10) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000805 digits = 1;
806 base = 1;
807 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100808 else if (ch<100) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000809 digits = 2;
810 base = 10;
811 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100812 else if (ch<1000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000813 digits = 3;
814 base = 100;
815 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100816 else if (ch<10000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000817 digits = 4;
818 base = 1000;
819 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100820 else if (ch<100000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000821 digits = 5;
822 base = 10000;
823 }
Martin v. Löwisb09af032011-11-04 11:16:41 +0100824 else if (ch<1000000) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000825 digits = 6;
826 base = 100000;
827 }
828 else {
829 digits = 7;
830 base = 1000000;
831 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000832 while (digits-->0) {
Martin v. Löwisb09af032011-11-04 11:16:41 +0100833 *outp++ = '0' + ch/base;
834 ch %= base;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000835 base /= 10;
836 }
837 *outp++ = ';';
838 }
Victor Stinner8f825062012-04-27 13:55:39 +0200839 assert(_PyUnicode_CheckConsistency(res, 1));
840 restuple = Py_BuildValue("(Nn)", res, end);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000841 Py_DECREF(object);
842 return restuple;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000843 }
844 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000845 wrong_exception_type(exc);
846 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 }
848}
849
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000850PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
851{
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200852 PyObject *object;
853 Py_ssize_t i;
854 Py_ssize_t start;
855 Py_ssize_t end;
856 PyObject *res;
857 unsigned char *outp;
858 int ressize;
859 Py_UCS4 c;
860
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300861 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300862 const unsigned char *p;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200863 if (PyUnicodeDecodeError_GetStart(exc, &start))
864 return NULL;
865 if (PyUnicodeDecodeError_GetEnd(exc, &end))
866 return NULL;
867 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
868 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +0300869 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200870 res = PyUnicode_New(4 * (end - start), 127);
871 if (res == NULL) {
872 Py_DECREF(object);
873 return NULL;
874 }
875 outp = PyUnicode_1BYTE_DATA(res);
876 for (i = start; i < end; i++, outp += 4) {
877 unsigned char c = p[i];
878 outp[0] = '\\';
879 outp[1] = 'x';
880 outp[2] = Py_hexdigits[(c>>4)&0xf];
881 outp[3] = Py_hexdigits[c&0xf];
882 }
883
884 assert(_PyUnicode_CheckConsistency(res, 1));
885 Py_DECREF(object);
886 return Py_BuildValue("(Nn)", res, end);
887 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300888 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000889 if (PyUnicodeEncodeError_GetStart(exc, &start))
890 return NULL;
891 if (PyUnicodeEncodeError_GetEnd(exc, &end))
892 return NULL;
893 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
894 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200895 }
Serhiy Storchakac0937f72015-05-18 16:10:40 +0300896 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200897 if (PyUnicodeTranslateError_GetStart(exc, &start))
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000898 return NULL;
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200899 if (PyUnicodeTranslateError_GetEnd(exc, &end))
900 return NULL;
901 if (!(object = PyUnicodeTranslateError_GetObject(exc)))
902 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000903 }
904 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +0000905 wrong_exception_type(exc);
906 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000907 }
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200908
909 if (end - start > PY_SSIZE_T_MAX / (1+1+8))
910 end = start + PY_SSIZE_T_MAX / (1+1+8);
911 for (i = start, ressize = 0; i < end; ++i) {
912 /* object is guaranteed to be "ready" */
913 c = PyUnicode_READ_CHAR(object, i);
914 if (c >= 0x10000) {
915 ressize += 1+1+8;
916 }
917 else if (c >= 0x100) {
918 ressize += 1+1+4;
919 }
920 else
921 ressize += 1+1+2;
922 }
923 res = PyUnicode_New(ressize, 127);
924 if (res == NULL) {
925 Py_DECREF(object);
926 return NULL;
927 }
928 outp = PyUnicode_1BYTE_DATA(res);
929 for (i = start; i < end; ++i) {
930 c = PyUnicode_READ_CHAR(object, i);
931 *outp++ = '\\';
932 if (c >= 0x00010000) {
933 *outp++ = 'U';
934 *outp++ = Py_hexdigits[(c>>28)&0xf];
935 *outp++ = Py_hexdigits[(c>>24)&0xf];
936 *outp++ = Py_hexdigits[(c>>20)&0xf];
937 *outp++ = Py_hexdigits[(c>>16)&0xf];
938 *outp++ = Py_hexdigits[(c>>12)&0xf];
939 *outp++ = Py_hexdigits[(c>>8)&0xf];
940 }
941 else if (c >= 0x100) {
942 *outp++ = 'u';
943 *outp++ = Py_hexdigits[(c>>12)&0xf];
944 *outp++ = Py_hexdigits[(c>>8)&0xf];
945 }
946 else
947 *outp++ = 'x';
948 *outp++ = Py_hexdigits[(c>>4)&0xf];
949 *outp++ = Py_hexdigits[c&0xf];
950 }
951
952 assert(_PyUnicode_CheckConsistency(res, 1));
953 Py_DECREF(object);
954 return Py_BuildValue("(Nn)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000955}
956
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200957static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200958
959PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
960{
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +0300961 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200962 PyObject *restuple;
963 PyObject *object;
964 Py_ssize_t i;
965 Py_ssize_t start;
966 Py_ssize_t end;
967 PyObject *res;
968 unsigned char *outp;
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200969 Py_ssize_t ressize;
970 int replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200971 Py_UCS4 c;
972 char buffer[256]; /* NAME_MAXLEN */
973 if (PyUnicodeEncodeError_GetStart(exc, &start))
974 return NULL;
975 if (PyUnicodeEncodeError_GetEnd(exc, &end))
976 return NULL;
977 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
978 return NULL;
Victor Stinner38b8ae02015-09-03 16:19:40 +0200979 if (!ucnhash_CAPI) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200980 /* load the unicode data module */
981 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
982 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200983 if (!ucnhash_CAPI)
984 return NULL;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200985 }
986 for (i = start, ressize = 0; i < end; ++i) {
987 /* object is guaranteed to be "ready" */
988 c = PyUnicode_READ_CHAR(object, i);
Victor Stinner38b8ae02015-09-03 16:19:40 +0200989 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka26861b02015-02-16 20:52:17 +0200990 replsize = 1+1+1+(int)strlen(buffer)+1;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200991 }
992 else if (c >= 0x10000) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200993 replsize = 1+1+8;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200994 }
995 else if (c >= 0x100) {
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200996 replsize = 1+1+4;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +0200997 }
998 else
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +0200999 replsize = 1+1+2;
1000 if (ressize > PY_SSIZE_T_MAX - replsize)
1001 break;
1002 ressize += replsize;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001003 }
Serhiy Storchakaaacfccc2014-11-26 12:11:40 +02001004 end = i;
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001005 res = PyUnicode_New(ressize, 127);
1006 if (res==NULL)
1007 return NULL;
1008 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1009 i < end; ++i) {
1010 c = PyUnicode_READ_CHAR(object, i);
1011 *outp++ = '\\';
Victor Stinner38b8ae02015-09-03 16:19:40 +02001012 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001013 *outp++ = 'N';
1014 *outp++ = '{';
1015 strcpy((char *)outp, buffer);
1016 outp += strlen(buffer);
1017 *outp++ = '}';
1018 continue;
1019 }
1020 if (c >= 0x00010000) {
1021 *outp++ = 'U';
1022 *outp++ = Py_hexdigits[(c>>28)&0xf];
1023 *outp++ = Py_hexdigits[(c>>24)&0xf];
1024 *outp++ = Py_hexdigits[(c>>20)&0xf];
1025 *outp++ = Py_hexdigits[(c>>16)&0xf];
1026 *outp++ = Py_hexdigits[(c>>12)&0xf];
1027 *outp++ = Py_hexdigits[(c>>8)&0xf];
1028 }
1029 else if (c >= 0x100) {
1030 *outp++ = 'u';
1031 *outp++ = Py_hexdigits[(c>>12)&0xf];
1032 *outp++ = Py_hexdigits[(c>>8)&0xf];
1033 }
1034 else
1035 *outp++ = 'x';
1036 *outp++ = Py_hexdigits[(c>>4)&0xf];
1037 *outp++ = Py_hexdigits[c&0xf];
1038 }
1039
Benjamin Peterson3663b582014-11-26 14:39:54 -06001040 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001041 assert(_PyUnicode_CheckConsistency(res, 1));
1042 restuple = Py_BuildValue("(Nn)", res, end);
1043 Py_DECREF(object);
1044 return restuple;
1045 }
1046 else {
1047 wrong_exception_type(exc);
1048 return NULL;
1049 }
1050}
1051
/* Encoding identifiers returned by get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Return 'b' or 'l' if `s` is exactly "be" or "le" (ASCII
   case-insensitive), otherwise 0.

   s[0] is tested first so that nothing past the NUL terminator is read
   when `s` is the empty string. */
static int
parse_byte_order_suffix(const char *s)
{
    int order = Py_TOLOWER(s[0]);

    if ((order == 'b' || order == 'l') &&
        Py_TOLOWER(s[1]) == 'e' && s[2] == '\0') {
        return order;
    }
    return 0;
}

/* Map an encoding name onto one of the ENC_* constants.

   Recognised spellings (the "utf" prefix and the "be"/"le" suffix are
   ASCII case-insensitive, the separators '-' and '_' are optional):
   utf-8, utf-16, utf-16-be, utf-16-le, utf-32, utf-32-be, utf-32-le, and
   the exact string "CP_UTF8".  Without a byte-order suffix, UTF-16 and
   UTF-32 resolve to the byte order selected by WORDS_BIGENDIAN.

   On a match, *bytelength is set to 3 for UTF-8, 2 for UTF-16 and 4 for
   UTF-32.  Returns ENC_UNKNOWN for anything else; *bytelength must not be
   relied upon in that case. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    int order;

    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            encoding += 2;
            *bytelength = 2;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            order = parse_byte_order_suffix(encoding);
            if (order == 'b')
                return ENC_UTF16BE;
            if (order == 'l')
                return ENC_UTF16LE;
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            encoding += 2;
            *bytelength = 4;
            if (*encoding == '\0') {
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            }
            if (*encoding == '-' || *encoding == '_' )
                encoding++;
            order = parse_byte_order_suffix(encoding);
            if (order == 'b')
                return ENC_UTF32BE;
            if (order == 'l')
                return ENC_UTF32LE;
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1117
Martin v. Löwisaef3fb02009-05-02 19:27:30 +00001118/* This handler is declared static until someone demonstrates
1119 a need to call it directly. */
1120static PyObject *
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001121PyCodec_SurrogatePassErrors(PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001122{
1123 PyObject *restuple;
1124 PyObject *object;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001125 PyObject *encode;
Serhiy Storchaka85b0f5b2016-11-20 10:16:47 +02001126 const char *encoding;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001127 int code;
1128 int bytelength;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001129 Py_ssize_t i;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001130 Py_ssize_t start;
1131 Py_ssize_t end;
1132 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001133
1134 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001135 unsigned char *outp;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001136 if (PyUnicodeEncodeError_GetStart(exc, &start))
1137 return NULL;
1138 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1139 return NULL;
1140 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1141 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001142 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1143 Py_DECREF(object);
1144 return NULL;
1145 }
1146 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1147 Py_DECREF(object);
1148 Py_DECREF(encode);
1149 return NULL;
1150 }
1151 code = get_standard_encoding(encoding, &bytelength);
1152 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001153 if (code == ENC_UNKNOWN) {
1154 /* Not supported, fail with original exception */
1155 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1156 Py_DECREF(object);
1157 return NULL;
1158 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001159
Serhiy Storchaka2e374092014-10-04 14:15:49 +03001160 if (end - start > PY_SSIZE_T_MAX / bytelength)
1161 end = start + PY_SSIZE_T_MAX / bytelength;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001162 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001163 if (!res) {
1164 Py_DECREF(object);
1165 return NULL;
1166 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001167 outp = (unsigned char*)PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001168 for (i = start; i < end; i++) {
1169 /* object is guaranteed to be "ready" */
1170 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Victor Stinner76df43d2012-10-30 01:42:39 +01001171 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001172 /* Not a surrogate, fail with original exception */
1173 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1174 Py_DECREF(res);
1175 Py_DECREF(object);
1176 return NULL;
1177 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001178 switch (code) {
1179 case ENC_UTF8:
1180 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1181 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1182 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1183 break;
1184 case ENC_UTF16LE:
1185 *outp++ = (unsigned char) ch;
1186 *outp++ = (unsigned char)(ch >> 8);
1187 break;
1188 case ENC_UTF16BE:
1189 *outp++ = (unsigned char)(ch >> 8);
1190 *outp++ = (unsigned char) ch;
1191 break;
1192 case ENC_UTF32LE:
1193 *outp++ = (unsigned char) ch;
1194 *outp++ = (unsigned char)(ch >> 8);
1195 *outp++ = (unsigned char)(ch >> 16);
1196 *outp++ = (unsigned char)(ch >> 24);
1197 break;
1198 case ENC_UTF32BE:
1199 *outp++ = (unsigned char)(ch >> 24);
1200 *outp++ = (unsigned char)(ch >> 16);
1201 *outp++ = (unsigned char)(ch >> 8);
1202 *outp++ = (unsigned char) ch;
1203 break;
1204 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001205 }
1206 restuple = Py_BuildValue("(On)", res, end);
1207 Py_DECREF(res);
1208 Py_DECREF(object);
1209 return restuple;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001210 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001211 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001212 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001213 Py_UCS4 ch = 0;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001214 if (PyUnicodeDecodeError_GetStart(exc, &start))
1215 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001216 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1217 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001218 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1219 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001220 p = (const unsigned char*)PyBytes_AS_STRING(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001221 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1222 Py_DECREF(object);
1223 return NULL;
1224 }
1225 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1226 Py_DECREF(object);
1227 Py_DECREF(encode);
1228 return NULL;
1229 }
1230 code = get_standard_encoding(encoding, &bytelength);
1231 Py_DECREF(encode);
Serhiy Storchaka88d8fb62014-05-15 14:37:42 +03001232 if (code == ENC_UNKNOWN) {
1233 /* Not supported, fail with original exception */
1234 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1235 Py_DECREF(object);
1236 return NULL;
1237 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001238
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001239 /* Try decoding a single surrogate character. If
1240 there are more, let the codec call us again. */
1241 p += start;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001242 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1243 switch (code) {
1244 case ENC_UTF8:
1245 if ((p[0] & 0xf0) == 0xe0 &&
1246 (p[1] & 0xc0) == 0x80 &&
1247 (p[2] & 0xc0) == 0x80) {
1248 /* it's a three-byte code */
1249 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1250 }
1251 break;
1252 case ENC_UTF16LE:
1253 ch = p[1] << 8 | p[0];
1254 break;
1255 case ENC_UTF16BE:
1256 ch = p[0] << 8 | p[1];
1257 break;
1258 case ENC_UTF32LE:
1259 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1260 break;
1261 case ENC_UTF32BE:
1262 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1263 break;
1264 }
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001265 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001266
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001267 Py_DECREF(object);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001268 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1269 /* it's not a surrogate - fail */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001270 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1271 return NULL;
1272 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001273 res = PyUnicode_FromOrdinal(ch);
1274 if (res == NULL)
1275 return NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02001276 return Py_BuildValue("(Nn)", res, start + bytelength);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001277 }
1278 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001279 wrong_exception_type(exc);
1280 return NULL;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001281 }
1282}
1283
Martin v. Löwis011e8422009-05-05 04:43:17 +00001284static PyObject *
Martin v. Löwis43c57782009-05-10 08:15:24 +00001285PyCodec_SurrogateEscapeErrors(PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001286{
1287 PyObject *restuple;
1288 PyObject *object;
Martin v. Löwisb09af032011-11-04 11:16:41 +01001289 Py_ssize_t i;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001290 Py_ssize_t start;
1291 Py_ssize_t end;
1292 PyObject *res;
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001293
1294 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001295 char *outp;
1296 if (PyUnicodeEncodeError_GetStart(exc, &start))
1297 return NULL;
1298 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1299 return NULL;
1300 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1301 return NULL;
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001302 res = PyBytes_FromStringAndSize(NULL, end-start);
1303 if (!res) {
1304 Py_DECREF(object);
1305 return NULL;
1306 }
1307 outp = PyBytes_AsString(res);
Martin v. Löwisb09af032011-11-04 11:16:41 +01001308 for (i = start; i < end; i++) {
1309 /* object is guaranteed to be "ready" */
1310 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001311 if (ch < 0xdc80 || ch > 0xdcff) {
1312 /* Not a UTF-8b surrogate, fail with original exception */
1313 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1314 Py_DECREF(res);
1315 Py_DECREF(object);
1316 return NULL;
1317 }
1318 *outp++ = ch - 0xdc00;
1319 }
1320 restuple = Py_BuildValue("(On)", res, end);
1321 Py_DECREF(res);
1322 Py_DECREF(object);
1323 return restuple;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001324 }
Serhiy Storchakaca7fecb2015-05-18 16:08:52 +03001325 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001326 PyObject *str;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001327 const unsigned char *p;
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001328 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001329 int consumed = 0;
1330 if (PyUnicodeDecodeError_GetStart(exc, &start))
1331 return NULL;
1332 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1333 return NULL;
1334 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1335 return NULL;
Serhiy Storchakacb33a012016-10-23 09:44:50 +03001336 p = (const unsigned char*)PyBytes_AS_STRING(object);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001337 while (consumed < 4 && consumed < end-start) {
1338 /* Refuse to escape ASCII bytes. */
1339 if (p[start+consumed] < 128)
1340 break;
1341 ch[consumed] = 0xdc00 + p[start+consumed];
1342 consumed++;
1343 }
1344 Py_DECREF(object);
1345 if (!consumed) {
1346 /* codec complained about ASCII byte. */
1347 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1348 return NULL;
1349 }
Victor Stinnerc06bb7a2011-11-04 21:36:35 +01001350 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1351 if (str == NULL)
1352 return NULL;
1353 return Py_BuildValue("(Nn)", str, start+consumed);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001354 }
1355 else {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001356 wrong_exception_type(exc);
1357 return NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00001358 }
1359}
1360
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001361
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001362static PyObject *strict_errors(PyObject *self, PyObject *exc)
1363{
1364 return PyCodec_StrictErrors(exc);
1365}
1366
1367
1368static PyObject *ignore_errors(PyObject *self, PyObject *exc)
1369{
1370 return PyCodec_IgnoreErrors(exc);
1371}
1372
1373
1374static PyObject *replace_errors(PyObject *self, PyObject *exc)
1375{
1376 return PyCodec_ReplaceErrors(exc);
1377}
1378
1379
1380static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
1381{
1382 return PyCodec_XMLCharRefReplaceErrors(exc);
1383}
1384
1385
1386static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
1387{
1388 return PyCodec_BackslashReplaceErrors(exc);
1389}
1390
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001391static PyObject *namereplace_errors(PyObject *self, PyObject *exc)
1392{
1393 return PyCodec_NameReplaceErrors(exc);
1394}
1395
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001396static PyObject *surrogatepass_errors(PyObject *self, PyObject *exc)
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001397{
Martin v. Löwise0a2b722009-05-10 08:08:56 +00001398 return PyCodec_SurrogatePassErrors(exc);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00001399}
1400
Martin v. Löwis43c57782009-05-10 08:15:24 +00001401static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
Martin v. Löwis011e8422009-05-05 04:43:17 +00001402{
Martin v. Löwis43c57782009-05-10 08:15:24 +00001403 return PyCodec_SurrogateEscapeErrors(exc);
Martin v. Löwis011e8422009-05-05 04:43:17 +00001404}
1405
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001406static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001407{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001408 static struct {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001409 char *name;
1410 PyMethodDef def;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001411 } methods[] =
1412 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001413 {
1414 "strict",
1415 {
1416 "strict_errors",
1417 strict_errors,
1418 METH_O,
1419 PyDoc_STR("Implements the 'strict' error handling, which "
1420 "raises a UnicodeError on coding errors.")
1421 }
1422 },
1423 {
1424 "ignore",
1425 {
1426 "ignore_errors",
1427 ignore_errors,
1428 METH_O,
1429 PyDoc_STR("Implements the 'ignore' error handling, which "
1430 "ignores malformed data and continues.")
1431 }
1432 },
1433 {
1434 "replace",
1435 {
1436 "replace_errors",
1437 replace_errors,
1438 METH_O,
1439 PyDoc_STR("Implements the 'replace' error handling, which "
1440 "replaces malformed data with a replacement marker.")
1441 }
1442 },
1443 {
1444 "xmlcharrefreplace",
1445 {
1446 "xmlcharrefreplace_errors",
1447 xmlcharrefreplace_errors,
1448 METH_O,
1449 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1450 "which replaces an unencodable character with the "
1451 "appropriate XML character reference.")
1452 }
1453 },
1454 {
1455 "backslashreplace",
1456 {
1457 "backslashreplace_errors",
1458 backslashreplace_errors,
1459 METH_O,
1460 PyDoc_STR("Implements the 'backslashreplace' error handling, "
Serhiy Storchaka07985ef2015-01-25 22:56:57 +02001461 "which replaces malformed data with a backslashed "
1462 "escape sequence.")
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001463 }
1464 },
1465 {
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001466 "namereplace",
1467 {
1468 "namereplace_errors",
1469 namereplace_errors,
1470 METH_O,
1471 PyDoc_STR("Implements the 'namereplace' error handling, "
1472 "which replaces an unencodable character with a "
1473 "\\N{...} escape sequence.")
1474 }
1475 },
1476 {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001477 "surrogatepass",
1478 {
1479 "surrogatepass",
1480 surrogatepass_errors,
1481 METH_O
1482 }
1483 },
1484 {
1485 "surrogateescape",
1486 {
1487 "surrogateescape",
1488 surrogateescape_errors,
1489 METH_O
1490 }
1491 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001492 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001493
Victor Stinnercaba55b2018-08-03 15:33:52 +02001494 PyInterpreterState *interp = _PyInterpreterState_Get();
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001495 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +00001496 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001497
1498 if (interp->codec_search_path != NULL)
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001499 return 0;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001500
1501 interp->codec_search_path = PyList_New(0);
1502 interp->codec_search_cache = PyDict_New();
1503 interp->codec_error_registry = PyDict_New();
1504
1505 if (interp->codec_error_registry) {
Victor Stinner63941882011-09-29 00:42:28 +02001506 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
Andrew Svetlov3ba3a3e2012-12-25 13:32:35 +02001507 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001508 int res;
1509 if (!func)
1510 Py_FatalError("can't initialize codec error registry");
1511 res = PyCodec_RegisterError(methods[i].name, func);
1512 Py_DECREF(func);
1513 if (res)
1514 Py_FatalError("can't initialize codec error registry");
1515 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001516 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001517
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001518 if (interp->codec_search_path == NULL ||
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001519 interp->codec_search_cache == NULL ||
1520 interp->codec_error_registry == NULL)
1521 Py_FatalError("can't initialize codec registry");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001522
Christian Heimes819b8bf2008-01-03 23:05:47 +00001523 mod = PyImport_ImportModuleNoBlock("encodings");
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001524 if (mod == NULL) {
Antoine Pitrouf95a1b32010-05-09 15:52:27 +00001525 return -1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001526 }
1527 Py_DECREF(mod);
Christian Heimes6a27efa2008-10-30 21:48:26 +00001528 interp->codecs_initialized = 1;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +00001529 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001530}