blob: 253bc393260d385af191d697ba417e1f710fda6f [file] [log] [blame]
/* ------------------------------------------------------------------------

   Python Codec Registry and support functions

Written by Marc-Andre Lemburg (mal@lemburg.com).

Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
/* --- Codec Registry ----------------------------------------------------- */

/* Import the standard encodings package which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not slow
   down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one try is
   made.

*/
26
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000027static int _PyCodecRegistry_Init(void); /* Forward */
Guido van Rossumfeee4b92000-03-10 22:57:27 +000028
Guido van Rossumfeee4b92000-03-10 22:57:27 +000029int PyCodec_Register(PyObject *search_function)
30{
Nicholas Bastine5662ae2004-03-24 22:22:12 +000031 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000032 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000034 if (search_function == NULL) {
35 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037 }
38 if (!PyCallable_Check(search_function)) {
Neal Norwitz3715c3e2005-11-24 22:09:18 +000039 PyErr_SetString(PyExc_TypeError, "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000040 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000041 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000042 return PyList_Append(interp->codec_search_path, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043
44 onError:
45 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000046}
47
Guido van Rossum9e896b32000-04-05 20:11:21 +000048/* Convert a string to a normalized Python string: all characters are
49 converted to lower case, spaces are replaced with underscores. */
50
Guido van Rossumfeee4b92000-03-10 22:57:27 +000051static
Guido van Rossum9e896b32000-04-05 20:11:21 +000052PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053{
Guido van Rossum33831132000-06-29 14:50:15 +000054 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000055 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 char *p;
57 PyObject *v;
58
Guido van Rossum582acec2000-06-28 22:07:35 +000059 if (len > INT_MAX) {
60 PyErr_SetString(PyExc_OverflowError, "string is too large");
61 return NULL;
62 }
63
64 v = PyString_FromStringAndSize(NULL, (int)len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000065 if (v == NULL)
66 return NULL;
67 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000068 for (i = 0; i < len; i++) {
69 register char ch = string[i];
70 if (ch == ' ')
71 ch = '-';
72 else
73 ch = tolower(ch);
74 p[i] = ch;
75 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000076 return v;
77}
78
79/* Lookup the given encoding and return a tuple providing the codec
80 facilities.
81
82 The encoding string is looked up converted to all lower-case
83 characters. This makes encodings looked up through this mechanism
84 effectively case-insensitive.
85
Fred Drake766de832000-05-09 19:55:59 +000086 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +000087
88 As side effect, this tries to load the encodings package, if not
89 yet done. This is part of the lazy load strategy for the encodings
90 package.
91
92*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +000093
94PyObject *_PyCodec_Lookup(const char *encoding)
95{
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +000096 PyInterpreterState *interp;
Guido van Rossum5ba3c842000-03-24 20:52:23 +000097 PyObject *result, *args = NULL, *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000098 int i, len;
99
Fred Drake766de832000-05-09 19:55:59 +0000100 if (encoding == NULL) {
101 PyErr_BadArgument();
102 goto onError;
103 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000104
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000105 interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000106 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
Barry Warsaw51ac5802000-03-20 16:36:48 +0000107 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000108
Guido van Rossum9e896b32000-04-05 20:11:21 +0000109 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000110 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000111 replaced with underscores. */
112 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000113 if (v == NULL)
114 goto onError;
115 PyString_InternInPlace(&v);
116
117 /* First, try to lookup the name in the registry dictionary */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000118 result = PyDict_GetItem(interp->codec_search_cache, v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000119 if (result != NULL) {
120 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000121 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000122 return result;
123 }
124
125 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000126 args = PyTuple_New(1);
127 if (args == NULL)
128 goto onError;
129 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000131 len = PyList_Size(interp->codec_search_path);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000132 if (len < 0)
133 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (len == 0) {
135 PyErr_SetString(PyExc_LookupError,
136 "no codec search functions registered: "
137 "can't find encoding");
138 goto onError;
139 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000140
141 for (i = 0; i < len; i++) {
142 PyObject *func;
143
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000144 func = PyList_GetItem(interp->codec_search_path, i);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000145 if (func == NULL)
146 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000147 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (result == NULL)
149 goto onError;
150 if (result == Py_None) {
151 Py_DECREF(result);
152 continue;
153 }
154 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155 PyErr_SetString(PyExc_TypeError,
156 "codec search functions must return 4-tuples");
157 Py_DECREF(result);
158 goto onError;
159 }
160 break;
161 }
162 if (i == len) {
163 /* XXX Perhaps we should cache misses too ? */
Martin v. Löwiseb42b022002-09-26 16:01:24 +0000164 PyErr_Format(PyExc_LookupError,
165 "unknown encoding: %s", encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000166 goto onError;
167 }
168
169 /* Cache and return the result */
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000170 PyDict_SetItem(interp->codec_search_cache, v, result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000171 Py_DECREF(args);
172 return result;
173
174 onError:
175 Py_XDECREF(args);
176 return NULL;
177}
178
179static
180PyObject *args_tuple(PyObject *object,
181 const char *errors)
182{
183 PyObject *args;
184
185 args = PyTuple_New(1 + (errors != NULL));
186 if (args == NULL)
187 return NULL;
188 Py_INCREF(object);
189 PyTuple_SET_ITEM(args,0,object);
190 if (errors) {
191 PyObject *v;
192
193 v = PyString_FromString(errors);
194 if (v == NULL) {
195 Py_DECREF(args);
196 return NULL;
197 }
198 PyTuple_SET_ITEM(args, 1, v);
199 }
200 return args;
201}
202
203/* Build a codec by calling factory(stream[,errors]) or just
204 factory(errors) depending on whether the given parameters are
205 non-NULL. */
206
207static
208PyObject *build_stream_codec(PyObject *factory,
209 PyObject *stream,
210 const char *errors)
211{
212 PyObject *args, *codec;
213
214 args = args_tuple(stream, errors);
215 if (args == NULL)
216 return NULL;
217
218 codec = PyEval_CallObject(factory, args);
219 Py_DECREF(args);
220 return codec;
221}
222
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount.

 */
228
229PyObject *PyCodec_Encoder(const char *encoding)
230{
231 PyObject *codecs;
232 PyObject *v;
233
234 codecs = _PyCodec_Lookup(encoding);
235 if (codecs == NULL)
236 goto onError;
237 v = PyTuple_GET_ITEM(codecs,0);
Mark Hammonde21262c2002-07-18 23:06:17 +0000238 Py_DECREF(codecs);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000239 Py_INCREF(v);
240 return v;
241
242 onError:
243 return NULL;
244}
245
246PyObject *PyCodec_Decoder(const char *encoding)
247{
248 PyObject *codecs;
249 PyObject *v;
250
251 codecs = _PyCodec_Lookup(encoding);
252 if (codecs == NULL)
253 goto onError;
254 v = PyTuple_GET_ITEM(codecs,1);
Mark Hammonde21262c2002-07-18 23:06:17 +0000255 Py_DECREF(codecs);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000256 Py_INCREF(v);
257 return v;
258
259 onError:
260 return NULL;
261}
262
263PyObject *PyCodec_StreamReader(const char *encoding,
264 PyObject *stream,
265 const char *errors)
266{
Mark Hammonde21262c2002-07-18 23:06:17 +0000267 PyObject *codecs, *ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000268
269 codecs = _PyCodec_Lookup(encoding);
270 if (codecs == NULL)
271 goto onError;
Mark Hammonde21262c2002-07-18 23:06:17 +0000272 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
273 Py_DECREF(codecs);
274 return ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000275
276 onError:
277 return NULL;
278}
279
280PyObject *PyCodec_StreamWriter(const char *encoding,
281 PyObject *stream,
282 const char *errors)
283{
Mark Hammonde21262c2002-07-18 23:06:17 +0000284 PyObject *codecs, *ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000285
286 codecs = _PyCodec_Lookup(encoding);
287 if (codecs == NULL)
288 goto onError;
Mark Hammonde21262c2002-07-18 23:06:17 +0000289 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
290 Py_DECREF(codecs);
291 return ret;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000292
293 onError:
294 return NULL;
295}
296
297/* Encode an object (e.g. an Unicode object) using the given encoding
298 and return the resulting encoded object (usually a Python string).
299
300 errors is passed to the encoder factory as argument if non-NULL. */
301
302PyObject *PyCodec_Encode(PyObject *object,
303 const char *encoding,
304 const char *errors)
305{
306 PyObject *encoder = NULL;
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000307 PyObject *args = NULL, *result = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000308 PyObject *v;
309
310 encoder = PyCodec_Encoder(encoding);
311 if (encoder == NULL)
312 goto onError;
313
314 args = args_tuple(object, errors);
315 if (args == NULL)
316 goto onError;
317
318 result = PyEval_CallObject(encoder,args);
319 if (result == NULL)
320 goto onError;
321
322 if (!PyTuple_Check(result) ||
323 PyTuple_GET_SIZE(result) != 2) {
324 PyErr_SetString(PyExc_TypeError,
325 "encoder must return a tuple (object,integer)");
326 goto onError;
327 }
328 v = PyTuple_GET_ITEM(result,0);
329 Py_INCREF(v);
330 /* We don't check or use the second (integer) entry. */
331
332 Py_DECREF(args);
333 Py_DECREF(encoder);
334 Py_DECREF(result);
335 return v;
336
337 onError:
Neal Norwitz3715c3e2005-11-24 22:09:18 +0000338 Py_XDECREF(result);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000339 Py_XDECREF(args);
340 Py_XDECREF(encoder);
341 return NULL;
342}
343
344/* Decode an object (usually a Python string) using the given encoding
345 and return an equivalent object (e.g. an Unicode object).
346
347 errors is passed to the decoder factory as argument if non-NULL. */
348
349PyObject *PyCodec_Decode(PyObject *object,
350 const char *encoding,
351 const char *errors)
352{
353 PyObject *decoder = NULL;
354 PyObject *args = NULL, *result = NULL;
355 PyObject *v;
356
357 decoder = PyCodec_Decoder(encoding);
358 if (decoder == NULL)
359 goto onError;
360
361 args = args_tuple(object, errors);
362 if (args == NULL)
363 goto onError;
364
365 result = PyEval_CallObject(decoder,args);
366 if (result == NULL)
367 goto onError;
368 if (!PyTuple_Check(result) ||
369 PyTuple_GET_SIZE(result) != 2) {
370 PyErr_SetString(PyExc_TypeError,
371 "decoder must return a tuple (object,integer)");
372 goto onError;
373 }
374 v = PyTuple_GET_ITEM(result,0);
375 Py_INCREF(v);
376 /* We don't check or use the second (integer) entry. */
377
378 Py_DECREF(args);
379 Py_DECREF(decoder);
380 Py_DECREF(result);
381 return v;
382
383 onError:
384 Py_XDECREF(args);
385 Py_XDECREF(decoder);
386 Py_XDECREF(result);
387 return NULL;
388}
389
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000390/* Register the error handling callback function error under the name
391 name. This function will be called by the codec when it encounters
392 an unencodable characters/undecodable bytes and doesn't know the
393 callback name, when name is specified as the error parameter
394 in the call to the encode/decode function.
395 Return 0 on success, -1 on error */
396int PyCodec_RegisterError(const char *name, PyObject *error)
397{
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000398 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000399 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
400 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000401 if (!PyCallable_Check(error)) {
402 PyErr_SetString(PyExc_TypeError, "handler must be callable");
403 return -1;
404 }
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000405 return PyDict_SetItemString(interp->codec_error_registry,
406 (char *)name, error);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000407}
408
409/* Lookup the error handling callback function registered under the
410 name error. As a special case NULL can be passed, in which case
411 the error handling callback for strict encoding will be returned. */
412PyObject *PyCodec_LookupError(const char *name)
413{
414 PyObject *handler = NULL;
415
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000416 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000417 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
418 return NULL;
419
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000420 if (name==NULL)
421 name = "strict";
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000422 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000423 if (!handler)
424 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
425 else
426 Py_INCREF(handler);
427 return handler;
428}
429
430static void wrong_exception_type(PyObject *exc)
431{
432 PyObject *type = PyObject_GetAttrString(exc, "__class__");
433 if (type != NULL) {
434 PyObject *name = PyObject_GetAttrString(type, "__name__");
435 Py_DECREF(type);
436 if (name != NULL) {
437 PyObject *string = PyObject_Str(name);
438 Py_DECREF(name);
Walter Dörwaldf7bcd1d2002-09-02 18:22:32 +0000439 if (string != NULL) {
440 PyErr_Format(PyExc_TypeError,
441 "don't know how to handle %.400s in error callback",
442 PyString_AS_STRING(string));
443 Py_DECREF(string);
444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000445 }
446 }
447}
448
449PyObject *PyCodec_StrictErrors(PyObject *exc)
450{
Brett Cannonbf364092006-03-01 04:25:17 +0000451 if (PyExceptionInstance_Check(exc))
452 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000453 else
454 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
455 return NULL;
456}
457
458
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000459#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000460PyObject *PyCodec_IgnoreErrors(PyObject *exc)
461{
Martin v. Löwis18e16552006-02-15 17:27:45 +0000462 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000463 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
464 if (PyUnicodeEncodeError_GetEnd(exc, &end))
465 return NULL;
466 }
467 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
468 if (PyUnicodeDecodeError_GetEnd(exc, &end))
469 return NULL;
470 }
471 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
472 if (PyUnicodeTranslateError_GetEnd(exc, &end))
473 return NULL;
474 }
475 else {
476 wrong_exception_type(exc);
477 return NULL;
478 }
479 /* ouch: passing NULL, 0, pos gives None instead of u'' */
Martin v. Löwis18e16552006-02-15 17:27:45 +0000480 return Py_BuildValue("(u#n)", &end, 0, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000481}
482
483
484PyObject *PyCodec_ReplaceErrors(PyObject *exc)
485{
486 PyObject *restuple;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000487 Py_ssize_t start;
488 Py_ssize_t end;
489 Py_ssize_t i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000490
491 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
492 PyObject *res;
493 Py_UNICODE *p;
494 if (PyUnicodeEncodeError_GetStart(exc, &start))
495 return NULL;
496 if (PyUnicodeEncodeError_GetEnd(exc, &end))
497 return NULL;
498 res = PyUnicode_FromUnicode(NULL, end-start);
499 if (res == NULL)
500 return NULL;
501 for (p = PyUnicode_AS_UNICODE(res), i = start;
502 i<end; ++p, ++i)
503 *p = '?';
Martin v. Löwis18e16552006-02-15 17:27:45 +0000504 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000505 Py_DECREF(res);
506 return restuple;
507 }
508 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
509 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
510 if (PyUnicodeDecodeError_GetEnd(exc, &end))
511 return NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000512 return Py_BuildValue("(u#n)", &res, 1, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000513 }
514 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
515 PyObject *res;
516 Py_UNICODE *p;
517 if (PyUnicodeTranslateError_GetStart(exc, &start))
518 return NULL;
519 if (PyUnicodeTranslateError_GetEnd(exc, &end))
520 return NULL;
521 res = PyUnicode_FromUnicode(NULL, end-start);
522 if (res == NULL)
523 return NULL;
524 for (p = PyUnicode_AS_UNICODE(res), i = start;
525 i<end; ++p, ++i)
526 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000527 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528 Py_DECREF(res);
529 return restuple;
530 }
531 else {
532 wrong_exception_type(exc);
533 return NULL;
534 }
535}
536
537PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
538{
539 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
540 PyObject *restuple;
541 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000542 Py_ssize_t start;
543 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000544 PyObject *res;
545 Py_UNICODE *p;
546 Py_UNICODE *startp;
547 Py_UNICODE *outp;
548 int ressize;
549 if (PyUnicodeEncodeError_GetStart(exc, &start))
550 return NULL;
551 if (PyUnicodeEncodeError_GetEnd(exc, &end))
552 return NULL;
553 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
554 return NULL;
555 startp = PyUnicode_AS_UNICODE(object);
556 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
557 if (*p<10)
558 ressize += 2+1+1;
559 else if (*p<100)
560 ressize += 2+2+1;
561 else if (*p<1000)
562 ressize += 2+3+1;
563 else if (*p<10000)
564 ressize += 2+4+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000565#ifndef Py_UNICODE_WIDE
566 else
567 ressize += 2+5+1;
568#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000569 else if (*p<100000)
570 ressize += 2+5+1;
571 else if (*p<1000000)
572 ressize += 2+6+1;
573 else
574 ressize += 2+7+1;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000575#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000576 }
577 /* allocate replacement */
578 res = PyUnicode_FromUnicode(NULL, ressize);
579 if (res == NULL) {
580 Py_DECREF(object);
581 return NULL;
582 }
583 /* generate replacement */
584 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
585 p < startp+end; ++p) {
586 Py_UNICODE c = *p;
587 int digits;
588 int base;
589 *outp++ = '&';
590 *outp++ = '#';
591 if (*p<10) {
592 digits = 1;
593 base = 1;
594 }
595 else if (*p<100) {
596 digits = 2;
597 base = 10;
598 }
599 else if (*p<1000) {
600 digits = 3;
601 base = 100;
602 }
603 else if (*p<10000) {
604 digits = 4;
605 base = 1000;
606 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000607#ifndef Py_UNICODE_WIDE
608 else {
609 digits = 5;
610 base = 10000;
611 }
612#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000613 else if (*p<100000) {
614 digits = 5;
615 base = 10000;
616 }
617 else if (*p<1000000) {
618 digits = 6;
619 base = 100000;
620 }
621 else {
622 digits = 7;
623 base = 1000000;
624 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000625#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000626 while (digits-->0) {
627 *outp++ = '0' + c/base;
628 c %= base;
629 base /= 10;
630 }
631 *outp++ = ';';
632 }
Martin v. Löwis18e16552006-02-15 17:27:45 +0000633 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000634 Py_DECREF(res);
635 Py_DECREF(object);
636 return restuple;
637 }
638 else {
639 wrong_exception_type(exc);
640 return NULL;
641 }
642}
643
644static Py_UNICODE hexdigits[] = {
645 '0', '1', '2', '3', '4', '5', '6', '7',
646 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
647};
648
649PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
650{
651 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
652 PyObject *restuple;
653 PyObject *object;
Martin v. Löwis18e16552006-02-15 17:27:45 +0000654 Py_ssize_t start;
655 Py_ssize_t end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000656 PyObject *res;
657 Py_UNICODE *p;
658 Py_UNICODE *startp;
659 Py_UNICODE *outp;
660 int ressize;
661 if (PyUnicodeEncodeError_GetStart(exc, &start))
662 return NULL;
663 if (PyUnicodeEncodeError_GetEnd(exc, &end))
664 return NULL;
665 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
666 return NULL;
667 startp = PyUnicode_AS_UNICODE(object);
668 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000669#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000670 if (*p >= 0x00010000)
671 ressize += 1+1+8;
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000672 else
673#endif
674 if (*p >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000675 ressize += 1+1+4;
676 }
677 else
678 ressize += 1+1+2;
679 }
680 res = PyUnicode_FromUnicode(NULL, ressize);
681 if (res==NULL)
682 return NULL;
683 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
684 p < startp+end; ++p) {
685 Py_UNICODE c = *p;
686 *outp++ = '\\';
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000687#ifdef Py_UNICODE_WIDE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000688 if (c >= 0x00010000) {
689 *outp++ = 'U';
690 *outp++ = hexdigits[(c>>28)&0xf];
691 *outp++ = hexdigits[(c>>24)&0xf];
692 *outp++ = hexdigits[(c>>20)&0xf];
693 *outp++ = hexdigits[(c>>16)&0xf];
694 *outp++ = hexdigits[(c>>12)&0xf];
695 *outp++ = hexdigits[(c>>8)&0xf];
696 }
Hye-Shik Chang7db07e62003-12-29 01:36:01 +0000697 else
698#endif
699 if (c >= 0x100) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000700 *outp++ = 'u';
701 *outp++ = hexdigits[(c>>12)&0xf];
702 *outp++ = hexdigits[(c>>8)&0xf];
703 }
704 else
705 *outp++ = 'x';
706 *outp++ = hexdigits[(c>>4)&0xf];
707 *outp++ = hexdigits[c&0xf];
708 }
709
Martin v. Löwis18e16552006-02-15 17:27:45 +0000710 restuple = Py_BuildValue("(On)", res, end);
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000711 Py_DECREF(res);
712 Py_DECREF(object);
713 return restuple;
714 }
715 else {
716 wrong_exception_type(exc);
717 return NULL;
718 }
719}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000720#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000721
722static PyObject *strict_errors(PyObject *self, PyObject *exc)
723{
724 return PyCodec_StrictErrors(exc);
725}
726
727
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000728#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000729static PyObject *ignore_errors(PyObject *self, PyObject *exc)
730{
731 return PyCodec_IgnoreErrors(exc);
732}
733
734
735static PyObject *replace_errors(PyObject *self, PyObject *exc)
736{
737 return PyCodec_ReplaceErrors(exc);
738}
739
740
741static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
742{
743 return PyCodec_XMLCharRefReplaceErrors(exc);
744}
745
746
747static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
748{
749 return PyCodec_BackslashReplaceErrors(exc);
750}
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000751#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000752
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000753static int _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000754{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000755 static struct {
756 char *name;
757 PyMethodDef def;
758 } methods[] =
759 {
760 {
761 "strict",
762 {
763 "strict_errors",
764 strict_errors,
765 METH_O
766 }
767 },
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000768#ifdef Py_USING_UNICODE
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 {
770 "ignore",
771 {
772 "ignore_errors",
773 ignore_errors,
774 METH_O
775 }
776 },
777 {
778 "replace",
779 {
780 "replace_errors",
781 replace_errors,
782 METH_O
783 }
784 },
785 {
786 "xmlcharrefreplace",
787 {
788 "xmlcharrefreplace_errors",
789 xmlcharrefreplace_errors,
790 METH_O
791 }
792 },
793 {
794 "backslashreplace",
795 {
796 "backslashreplace_errors",
797 backslashreplace_errors,
798 METH_O
799 }
800 }
Walter Dörwaldbf73db82002-11-21 20:08:33 +0000801#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000802 };
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000803
Nicholas Bastine5662ae2004-03-24 22:22:12 +0000804 PyInterpreterState *interp = PyThreadState_GET()->interp;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000805 PyObject *mod;
Neal Norwitz739a8f82004-07-08 01:55:58 +0000806 unsigned i;
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000807
808 if (interp->codec_search_path != NULL)
809 return 0;
810
811 interp->codec_search_path = PyList_New(0);
812 interp->codec_search_cache = PyDict_New();
813 interp->codec_error_registry = PyDict_New();
814
815 if (interp->codec_error_registry) {
816 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
817 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
818 int res;
819 if (!func)
820 Py_FatalError("can't initialize codec error registry");
821 res = PyCodec_RegisterError(methods[i].name, func);
822 Py_DECREF(func);
823 if (res)
824 Py_FatalError("can't initialize codec error registry");
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000825 }
826 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000827
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000828 if (interp->codec_search_path == NULL ||
829 interp->codec_search_cache == NULL ||
830 interp->codec_error_registry == NULL)
831 Py_FatalError("can't initialize codec registry");
832
Thomas Woutersf7f438b2006-02-28 16:09:29 +0000833 mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
Gustavo Niemeyer5ddd4c32003-03-19 00:35:36 +0000834 if (mod == NULL) {
835 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
836 /* Ignore ImportErrors... this is done so that
837 distributions can disable the encodings package. Note
838 that other errors are not masked, e.g. SystemErrors
839 raised to inform the user of an error in the Python
840 configuration are still reported back to the user. */
841 PyErr_Clear();
842 return 0;
843 }
844 return -1;
845 }
846 Py_DECREF(mod);
847 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000848}