blob: 0a315d9a23d5c1ba88ed5d3265f5ebf687cdcd08 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
Jack Jansen41aa8e52000-07-03 21:39:47 +000013#ifdef HAVE_LIMITS_H
14#include <limits.h>
15#endif
Guido van Rossumfeee4b92000-03-10 22:57:27 +000016
17/* --- Globals ------------------------------------------------------------ */
18
19static PyObject *_PyCodec_SearchPath;
20static PyObject *_PyCodec_SearchCache;
21
22/* Flag used for lazy import of the standard encodings package */
23static int import_encodings_called = 0;
24
25/* --- Codec Registry ----------------------------------------------------- */
26
27/* Import the standard encodings package which will register the first
28 codec search function.
29
30 This is done in a lazy way so that the Unicode implementation does
31 not downgrade startup time of scripts not needing it.
32
Guido van Rossumb95de4f2000-03-31 17:25:23 +000033 ImportErrors are silently ignored by this function. Only one try is
34 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000035
36*/
37
38static
Thomas Woutersf70ef4f2000-07-22 18:47:25 +000039int import_encodings(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000040{
41 PyObject *mod;
42
43 import_encodings_called = 1;
44 mod = PyImport_ImportModule("encodings");
45 if (mod == NULL) {
Guido van Rossumb95de4f2000-03-31 17:25:23 +000046 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
47 /* Ignore ImportErrors... this is done so that
48 distributions can disable the encodings package. Note
49 that other errors are not masked, e.g. SystemErrors
50 raised to inform the user of an error in the Python
51 configuration are still reported back to the user. */
52 PyErr_Clear();
53 return 0;
54 }
55 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056 }
57 Py_DECREF(mod);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000058 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000059}
60
Guido van Rossumfeee4b92000-03-10 22:57:27 +000061int PyCodec_Register(PyObject *search_function)
62{
Guido van Rossumb95de4f2000-03-31 17:25:23 +000063 if (!import_encodings_called) {
64 if (import_encodings())
65 goto onError;
66 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000067 if (search_function == NULL) {
68 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000069 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000070 }
71 if (!PyCallable_Check(search_function)) {
72 PyErr_SetString(PyExc_TypeError,
73 "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000074 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000075 }
76 return PyList_Append(_PyCodec_SearchPath, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000077
78 onError:
79 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000080}
81
Guido van Rossum9e896b32000-04-05 20:11:21 +000082/* Convert a string to a normalized Python string: all characters are
83 converted to lower case, spaces are replaced with underscores. */
84
Guido van Rossumfeee4b92000-03-10 22:57:27 +000085static
Guido van Rossum9e896b32000-04-05 20:11:21 +000086PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000087{
Guido van Rossum33831132000-06-29 14:50:15 +000088 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000089 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000090 char *p;
91 PyObject *v;
92
Guido van Rossum582acec2000-06-28 22:07:35 +000093 if (len > INT_MAX) {
94 PyErr_SetString(PyExc_OverflowError, "string is too large");
95 return NULL;
96 }
97
98 v = PyString_FromStringAndSize(NULL, (int)len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000099 if (v == NULL)
100 return NULL;
101 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +0000102 for (i = 0; i < len; i++) {
103 register char ch = string[i];
104 if (ch == ' ')
105 ch = '-';
106 else
107 ch = tolower(ch);
108 p[i] = ch;
109 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000110 return v;
111}
112
113/* Lookup the given encoding and return a tuple providing the codec
114 facilities.
115
116 The encoding string is looked up converted to all lower-case
117 characters. This makes encodings looked up through this mechanism
118 effectively case-insensitive.
119
Fred Drake766de832000-05-09 19:55:59 +0000120 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000121
122 As side effect, this tries to load the encodings package, if not
123 yet done. This is part of the lazy load strategy for the encodings
124 package.
125
126*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127
128PyObject *_PyCodec_Lookup(const char *encoding)
129{
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130 PyObject *result, *args = NULL, *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 int i, len;
132
Fred Drake766de832000-05-09 19:55:59 +0000133 if (encoding == NULL) {
134 PyErr_BadArgument();
135 goto onError;
136 }
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000137 if (_PyCodec_SearchCache == NULL ||
138 _PyCodec_SearchPath == NULL) {
Barry Warsaw51ac5802000-03-20 16:36:48 +0000139 PyErr_SetString(PyExc_SystemError,
140 "codec module not properly initialized");
141 goto onError;
142 }
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000143 if (!import_encodings_called) {
144 if (import_encodings())
145 goto onError;
146 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000147
Guido van Rossum9e896b32000-04-05 20:11:21 +0000148 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000149 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000150 replaced with underscores. */
151 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000152 if (v == NULL)
153 goto onError;
154 PyString_InternInPlace(&v);
155
156 /* First, try to lookup the name in the registry dictionary */
157 result = PyDict_GetItem(_PyCodec_SearchCache, v);
158 if (result != NULL) {
159 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000160 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000161 return result;
162 }
163
164 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000165 args = PyTuple_New(1);
166 if (args == NULL)
167 goto onError;
168 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000169
170 len = PyList_Size(_PyCodec_SearchPath);
171 if (len < 0)
172 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000173 if (len == 0) {
174 PyErr_SetString(PyExc_LookupError,
175 "no codec search functions registered: "
176 "can't find encoding");
177 goto onError;
178 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000179
180 for (i = 0; i < len; i++) {
181 PyObject *func;
182
183 func = PyList_GetItem(_PyCodec_SearchPath, i);
184 if (func == NULL)
185 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000186 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000187 if (result == NULL)
188 goto onError;
189 if (result == Py_None) {
190 Py_DECREF(result);
191 continue;
192 }
193 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
194 PyErr_SetString(PyExc_TypeError,
195 "codec search functions must return 4-tuples");
196 Py_DECREF(result);
197 goto onError;
198 }
199 break;
200 }
201 if (i == len) {
202 /* XXX Perhaps we should cache misses too ? */
203 PyErr_SetString(PyExc_LookupError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000204 "unknown encoding");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000205 goto onError;
206 }
207
208 /* Cache and return the result */
209 PyDict_SetItem(_PyCodec_SearchCache, v, result);
210 Py_DECREF(args);
211 return result;
212
213 onError:
214 Py_XDECREF(args);
215 return NULL;
216}
217
218static
219PyObject *args_tuple(PyObject *object,
220 const char *errors)
221{
222 PyObject *args;
223
224 args = PyTuple_New(1 + (errors != NULL));
225 if (args == NULL)
226 return NULL;
227 Py_INCREF(object);
228 PyTuple_SET_ITEM(args,0,object);
229 if (errors) {
230 PyObject *v;
231
232 v = PyString_FromString(errors);
233 if (v == NULL) {
234 Py_DECREF(args);
235 return NULL;
236 }
237 PyTuple_SET_ITEM(args, 1, v);
238 }
239 return args;
240}
241
242/* Build a codec by calling factory(stream[,errors]) or just
243 factory(errors) depending on whether the given parameters are
244 non-NULL. */
245
246static
247PyObject *build_stream_codec(PyObject *factory,
248 PyObject *stream,
249 const char *errors)
250{
251 PyObject *args, *codec;
252
253 args = args_tuple(stream, errors);
254 if (args == NULL)
255 return NULL;
256
257 codec = PyEval_CallObject(factory, args);
258 Py_DECREF(args);
259 return codec;
260}
261
262/* Convenience APIs to query the Codec registry.
263
264 All APIs return a codec object with incremented refcount.
265
266 */
267
268PyObject *PyCodec_Encoder(const char *encoding)
269{
270 PyObject *codecs;
271 PyObject *v;
272
273 codecs = _PyCodec_Lookup(encoding);
274 if (codecs == NULL)
275 goto onError;
276 v = PyTuple_GET_ITEM(codecs,0);
277 Py_INCREF(v);
278 return v;
279
280 onError:
281 return NULL;
282}
283
284PyObject *PyCodec_Decoder(const char *encoding)
285{
286 PyObject *codecs;
287 PyObject *v;
288
289 codecs = _PyCodec_Lookup(encoding);
290 if (codecs == NULL)
291 goto onError;
292 v = PyTuple_GET_ITEM(codecs,1);
293 Py_INCREF(v);
294 return v;
295
296 onError:
297 return NULL;
298}
299
300PyObject *PyCodec_StreamReader(const char *encoding,
301 PyObject *stream,
302 const char *errors)
303{
304 PyObject *codecs;
305
306 codecs = _PyCodec_Lookup(encoding);
307 if (codecs == NULL)
308 goto onError;
309 return build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
310
311 onError:
312 return NULL;
313}
314
315PyObject *PyCodec_StreamWriter(const char *encoding,
316 PyObject *stream,
317 const char *errors)
318{
319 PyObject *codecs;
320
321 codecs = _PyCodec_Lookup(encoding);
322 if (codecs == NULL)
323 goto onError;
324 return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
325
326 onError:
327 return NULL;
328}
329
330/* Encode an object (e.g. an Unicode object) using the given encoding
331 and return the resulting encoded object (usually a Python string).
332
333 errors is passed to the encoder factory as argument if non-NULL. */
334
335PyObject *PyCodec_Encode(PyObject *object,
336 const char *encoding,
337 const char *errors)
338{
339 PyObject *encoder = NULL;
340 PyObject *args = NULL, *result;
341 PyObject *v;
342
343 encoder = PyCodec_Encoder(encoding);
344 if (encoder == NULL)
345 goto onError;
346
347 args = args_tuple(object, errors);
348 if (args == NULL)
349 goto onError;
350
351 result = PyEval_CallObject(encoder,args);
352 if (result == NULL)
353 goto onError;
354
355 if (!PyTuple_Check(result) ||
356 PyTuple_GET_SIZE(result) != 2) {
357 PyErr_SetString(PyExc_TypeError,
358 "encoder must return a tuple (object,integer)");
359 goto onError;
360 }
361 v = PyTuple_GET_ITEM(result,0);
362 Py_INCREF(v);
363 /* We don't check or use the second (integer) entry. */
364
365 Py_DECREF(args);
366 Py_DECREF(encoder);
367 Py_DECREF(result);
368 return v;
369
370 onError:
371 Py_XDECREF(args);
372 Py_XDECREF(encoder);
373 return NULL;
374}
375
376/* Decode an object (usually a Python string) using the given encoding
377 and return an equivalent object (e.g. an Unicode object).
378
379 errors is passed to the decoder factory as argument if non-NULL. */
380
381PyObject *PyCodec_Decode(PyObject *object,
382 const char *encoding,
383 const char *errors)
384{
385 PyObject *decoder = NULL;
386 PyObject *args = NULL, *result = NULL;
387 PyObject *v;
388
389 decoder = PyCodec_Decoder(encoding);
390 if (decoder == NULL)
391 goto onError;
392
393 args = args_tuple(object, errors);
394 if (args == NULL)
395 goto onError;
396
397 result = PyEval_CallObject(decoder,args);
398 if (result == NULL)
399 goto onError;
400 if (!PyTuple_Check(result) ||
401 PyTuple_GET_SIZE(result) != 2) {
402 PyErr_SetString(PyExc_TypeError,
403 "decoder must return a tuple (object,integer)");
404 goto onError;
405 }
406 v = PyTuple_GET_ITEM(result,0);
407 Py_INCREF(v);
408 /* We don't check or use the second (integer) entry. */
409
410 Py_DECREF(args);
411 Py_DECREF(decoder);
412 Py_DECREF(result);
413 return v;
414
415 onError:
416 Py_XDECREF(args);
417 Py_XDECREF(decoder);
418 Py_XDECREF(result);
419 return NULL;
420}
421
Thomas Woutersf70ef4f2000-07-22 18:47:25 +0000422void _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000423{
424 if (_PyCodec_SearchPath == NULL)
425 _PyCodec_SearchPath = PyList_New(0);
426 if (_PyCodec_SearchCache == NULL)
427 _PyCodec_SearchCache = PyDict_New();
428 if (_PyCodec_SearchPath == NULL ||
429 _PyCodec_SearchCache == NULL)
Thomas Wouters7e474022000-07-16 12:04:32 +0000430 Py_FatalError("can't initialize codec registry");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000431}
432
Thomas Woutersf70ef4f2000-07-22 18:47:25 +0000433void _PyCodecRegistry_Fini(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000434{
435 Py_XDECREF(_PyCodec_SearchPath);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000436 _PyCodec_SearchPath = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000437 Py_XDECREF(_PyCodec_SearchCache);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000438 _PyCodec_SearchCache = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000439}