blob: 4df389ab8fc933c908783791d24929230602e806 [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
14/* --- Globals ------------------------------------------------------------ */
15
16static PyObject *_PyCodec_SearchPath;
17static PyObject *_PyCodec_SearchCache;
18
19/* Flag used for lazy import of the standard encodings package */
20static int import_encodings_called = 0;
21
22/* --- Codec Registry ----------------------------------------------------- */
23
24/* Import the standard encodings package which will register the first
25 codec search function.
26
27 This is done in a lazy way so that the Unicode implementation does
28 not downgrade startup time of scripts not needing it.
29
Guido van Rossumb95de4f2000-03-31 17:25:23 +000030 ImportErrors are silently ignored by this function. Only one try is
31 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
33*/
34
35static
Guido van Rossumb95de4f2000-03-31 17:25:23 +000036int import_encodings()
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037{
38 PyObject *mod;
39
40 import_encodings_called = 1;
41 mod = PyImport_ImportModule("encodings");
42 if (mod == NULL) {
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
44 /* Ignore ImportErrors... this is done so that
45 distributions can disable the encodings package. Note
46 that other errors are not masked, e.g. SystemErrors
47 raised to inform the user of an error in the Python
48 configuration are still reported back to the user. */
49 PyErr_Clear();
50 return 0;
51 }
52 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053 }
54 Py_DECREF(mod);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000055 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056}
57
58/* Register a new codec search function.
59
Guido van Rossumb95de4f2000-03-31 17:25:23 +000060 As side effect, this tries to load the encodings package, if not
61 yet done, to make sure that it is always first in the list of
62 search functions.
63
Guido van Rossumfeee4b92000-03-10 22:57:27 +000064 The search_function's refcount is incremented by this function. */
65
66int PyCodec_Register(PyObject *search_function)
67{
Guido van Rossumb95de4f2000-03-31 17:25:23 +000068 if (!import_encodings_called) {
69 if (import_encodings())
70 goto onError;
71 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000072 if (search_function == NULL) {
73 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000074 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000075 }
76 if (!PyCallable_Check(search_function)) {
77 PyErr_SetString(PyExc_TypeError,
78 "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000079 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000080 }
81 return PyList_Append(_PyCodec_SearchPath, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000082
83 onError:
84 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000085}
86
Guido van Rossum9e896b32000-04-05 20:11:21 +000087/* Convert a string to a normalized Python string: all characters are
88 converted to lower case, spaces are replaced with underscores. */
89
Guido van Rossumfeee4b92000-03-10 22:57:27 +000090static
Guido van Rossum9e896b32000-04-05 20:11:21 +000091PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000092{
93 register int i;
94 int len = strlen(string);
95 char *p;
96 PyObject *v;
97
98 v = PyString_FromStringAndSize(NULL, len);
99 if (v == NULL)
100 return NULL;
101 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +0000102 for (i = 0; i < len; i++) {
103 register char ch = string[i];
104 if (ch == ' ')
105 ch = '-';
106 else
107 ch = tolower(ch);
108 p[i] = ch;
109 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000110 return v;
111}
112
113/* Lookup the given encoding and return a tuple providing the codec
114 facilities.
115
116 The encoding string is looked up converted to all lower-case
117 characters. This makes encodings looked up through this mechanism
118 effectively case-insensitive.
119
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000120 If no codec is found, a KeyError is set and NULL returned.
121
122 As side effect, this tries to load the encodings package, if not
123 yet done. This is part of the lazy load strategy for the encodings
124 package.
125
126*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000127
128PyObject *_PyCodec_Lookup(const char *encoding)
129{
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000130 PyObject *result, *args = NULL, *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000131 int i, len;
132
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000133 if (_PyCodec_SearchCache == NULL ||
134 _PyCodec_SearchPath == NULL) {
Barry Warsaw51ac5802000-03-20 16:36:48 +0000135 PyErr_SetString(PyExc_SystemError,
136 "codec module not properly initialized");
137 goto onError;
138 }
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000139 if (!import_encodings_called) {
140 if (import_encodings())
141 goto onError;
142 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000143
Guido van Rossum9e896b32000-04-05 20:11:21 +0000144 /* Convert the encoding to a normalized Python string: all
145 characters are converted to lower case, spaces and hypens are
146 replaced with underscores. */
147 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000148 if (v == NULL)
149 goto onError;
150 PyString_InternInPlace(&v);
151
152 /* First, try to lookup the name in the registry dictionary */
153 result = PyDict_GetItem(_PyCodec_SearchCache, v);
154 if (result != NULL) {
155 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000156 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000157 return result;
158 }
159
160 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000161 args = PyTuple_New(1);
162 if (args == NULL)
163 goto onError;
164 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000165
166 len = PyList_Size(_PyCodec_SearchPath);
167 if (len < 0)
168 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000169 if (len == 0) {
170 PyErr_SetString(PyExc_LookupError,
171 "no codec search functions registered: "
172 "can't find encoding");
173 goto onError;
174 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000175
176 for (i = 0; i < len; i++) {
177 PyObject *func;
178
179 func = PyList_GetItem(_PyCodec_SearchPath, i);
180 if (func == NULL)
181 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000182 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000183 if (result == NULL)
184 goto onError;
185 if (result == Py_None) {
186 Py_DECREF(result);
187 continue;
188 }
189 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
190 PyErr_SetString(PyExc_TypeError,
191 "codec search functions must return 4-tuples");
192 Py_DECREF(result);
193 goto onError;
194 }
195 break;
196 }
197 if (i == len) {
198 /* XXX Perhaps we should cache misses too ? */
199 PyErr_SetString(PyExc_LookupError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000200 "unknown encoding");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000201 goto onError;
202 }
203
204 /* Cache and return the result */
205 PyDict_SetItem(_PyCodec_SearchCache, v, result);
206 Py_DECREF(args);
207 return result;
208
209 onError:
210 Py_XDECREF(args);
211 return NULL;
212}
213
214static
215PyObject *args_tuple(PyObject *object,
216 const char *errors)
217{
218 PyObject *args;
219
220 args = PyTuple_New(1 + (errors != NULL));
221 if (args == NULL)
222 return NULL;
223 Py_INCREF(object);
224 PyTuple_SET_ITEM(args,0,object);
225 if (errors) {
226 PyObject *v;
227
228 v = PyString_FromString(errors);
229 if (v == NULL) {
230 Py_DECREF(args);
231 return NULL;
232 }
233 PyTuple_SET_ITEM(args, 1, v);
234 }
235 return args;
236}
237
238/* Build a codec by calling factory(stream[,errors]) or just
239 factory(errors) depending on whether the given parameters are
240 non-NULL. */
241
242static
243PyObject *build_stream_codec(PyObject *factory,
244 PyObject *stream,
245 const char *errors)
246{
247 PyObject *args, *codec;
248
249 args = args_tuple(stream, errors);
250 if (args == NULL)
251 return NULL;
252
253 codec = PyEval_CallObject(factory, args);
254 Py_DECREF(args);
255 return codec;
256}
257
258/* Convenience APIs to query the Codec registry.
259
260 All APIs return a codec object with incremented refcount.
261
262 */
263
264PyObject *PyCodec_Encoder(const char *encoding)
265{
266 PyObject *codecs;
267 PyObject *v;
268
269 codecs = _PyCodec_Lookup(encoding);
270 if (codecs == NULL)
271 goto onError;
272 v = PyTuple_GET_ITEM(codecs,0);
273 Py_INCREF(v);
274 return v;
275
276 onError:
277 return NULL;
278}
279
280PyObject *PyCodec_Decoder(const char *encoding)
281{
282 PyObject *codecs;
283 PyObject *v;
284
285 codecs = _PyCodec_Lookup(encoding);
286 if (codecs == NULL)
287 goto onError;
288 v = PyTuple_GET_ITEM(codecs,1);
289 Py_INCREF(v);
290 return v;
291
292 onError:
293 return NULL;
294}
295
296PyObject *PyCodec_StreamReader(const char *encoding,
297 PyObject *stream,
298 const char *errors)
299{
300 PyObject *codecs;
301
302 codecs = _PyCodec_Lookup(encoding);
303 if (codecs == NULL)
304 goto onError;
305 return build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
306
307 onError:
308 return NULL;
309}
310
311PyObject *PyCodec_StreamWriter(const char *encoding,
312 PyObject *stream,
313 const char *errors)
314{
315 PyObject *codecs;
316
317 codecs = _PyCodec_Lookup(encoding);
318 if (codecs == NULL)
319 goto onError;
320 return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
321
322 onError:
323 return NULL;
324}
325
326/* Encode an object (e.g. an Unicode object) using the given encoding
327 and return the resulting encoded object (usually a Python string).
328
329 errors is passed to the encoder factory as argument if non-NULL. */
330
331PyObject *PyCodec_Encode(PyObject *object,
332 const char *encoding,
333 const char *errors)
334{
335 PyObject *encoder = NULL;
336 PyObject *args = NULL, *result;
337 PyObject *v;
338
339 encoder = PyCodec_Encoder(encoding);
340 if (encoder == NULL)
341 goto onError;
342
343 args = args_tuple(object, errors);
344 if (args == NULL)
345 goto onError;
346
347 result = PyEval_CallObject(encoder,args);
348 if (result == NULL)
349 goto onError;
350
351 if (!PyTuple_Check(result) ||
352 PyTuple_GET_SIZE(result) != 2) {
353 PyErr_SetString(PyExc_TypeError,
354 "encoder must return a tuple (object,integer)");
355 goto onError;
356 }
357 v = PyTuple_GET_ITEM(result,0);
358 Py_INCREF(v);
359 /* We don't check or use the second (integer) entry. */
360
361 Py_DECREF(args);
362 Py_DECREF(encoder);
363 Py_DECREF(result);
364 return v;
365
366 onError:
367 Py_XDECREF(args);
368 Py_XDECREF(encoder);
369 return NULL;
370}
371
372/* Decode an object (usually a Python string) using the given encoding
373 and return an equivalent object (e.g. an Unicode object).
374
375 errors is passed to the decoder factory as argument if non-NULL. */
376
377PyObject *PyCodec_Decode(PyObject *object,
378 const char *encoding,
379 const char *errors)
380{
381 PyObject *decoder = NULL;
382 PyObject *args = NULL, *result = NULL;
383 PyObject *v;
384
385 decoder = PyCodec_Decoder(encoding);
386 if (decoder == NULL)
387 goto onError;
388
389 args = args_tuple(object, errors);
390 if (args == NULL)
391 goto onError;
392
393 result = PyEval_CallObject(decoder,args);
394 if (result == NULL)
395 goto onError;
396 if (!PyTuple_Check(result) ||
397 PyTuple_GET_SIZE(result) != 2) {
398 PyErr_SetString(PyExc_TypeError,
399 "decoder must return a tuple (object,integer)");
400 goto onError;
401 }
402 v = PyTuple_GET_ITEM(result,0);
403 Py_INCREF(v);
404 /* We don't check or use the second (integer) entry. */
405
406 Py_DECREF(args);
407 Py_DECREF(decoder);
408 Py_DECREF(result);
409 return v;
410
411 onError:
412 Py_XDECREF(args);
413 Py_XDECREF(decoder);
414 Py_XDECREF(result);
415 return NULL;
416}
417
418void _PyCodecRegistry_Init()
419{
420 if (_PyCodec_SearchPath == NULL)
421 _PyCodec_SearchPath = PyList_New(0);
422 if (_PyCodec_SearchCache == NULL)
423 _PyCodec_SearchCache = PyDict_New();
424 if (_PyCodec_SearchPath == NULL ||
425 _PyCodec_SearchCache == NULL)
426 Py_FatalError("can't intialize codec registry");
427}
428
429void _PyCodecRegistry_Fini()
430{
431 Py_XDECREF(_PyCodec_SearchPath);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000432 _PyCodec_SearchPath = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000433 Py_XDECREF(_PyCodec_SearchCache);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000434 _PyCodec_SearchCache = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000435}