blob: 3324b806fc9b227a384d285509ad57bf105b265c [file] [log] [blame]
Guido van Rossumfeee4b92000-03-10 22:57:27 +00001/* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5Written by Marc-Andre Lemburg (mal@lemburg.com).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumfeee4b92000-03-10 22:57:27 +00008
9 ------------------------------------------------------------------------ */
10
11#include "Python.h"
12#include <ctype.h>
13
14/* --- Globals ------------------------------------------------------------ */
15
16static PyObject *_PyCodec_SearchPath;
17static PyObject *_PyCodec_SearchCache;
18
19/* Flag used for lazy import of the standard encodings package */
20static int import_encodings_called = 0;
21
22/* --- Codec Registry ----------------------------------------------------- */
23
24/* Import the standard encodings package which will register the first
25 codec search function.
26
27 This is done in a lazy way so that the Unicode implementation does
28 not downgrade startup time of scripts not needing it.
29
Guido van Rossumb95de4f2000-03-31 17:25:23 +000030 ImportErrors are silently ignored by this function. Only one try is
31 made.
Guido van Rossumfeee4b92000-03-10 22:57:27 +000032
33*/
34
35static
Thomas Woutersf70ef4f2000-07-22 18:47:25 +000036int import_encodings(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000037{
38 PyObject *mod;
39
40 import_encodings_called = 1;
41 mod = PyImport_ImportModule("encodings");
42 if (mod == NULL) {
Guido van Rossumb95de4f2000-03-31 17:25:23 +000043 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
44 /* Ignore ImportErrors... this is done so that
45 distributions can disable the encodings package. Note
46 that other errors are not masked, e.g. SystemErrors
47 raised to inform the user of an error in the Python
48 configuration are still reported back to the user. */
49 PyErr_Clear();
50 return 0;
51 }
52 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000053 }
54 Py_DECREF(mod);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000055 return 0;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000056}
57
Guido van Rossumfeee4b92000-03-10 22:57:27 +000058int PyCodec_Register(PyObject *search_function)
59{
Guido van Rossumb95de4f2000-03-31 17:25:23 +000060 if (!import_encodings_called) {
61 if (import_encodings())
62 goto onError;
63 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +000064 if (search_function == NULL) {
65 PyErr_BadArgument();
Guido van Rossumb95de4f2000-03-31 17:25:23 +000066 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000067 }
68 if (!PyCallable_Check(search_function)) {
69 PyErr_SetString(PyExc_TypeError,
70 "argument must be callable");
Guido van Rossumb95de4f2000-03-31 17:25:23 +000071 goto onError;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000072 }
73 return PyList_Append(_PyCodec_SearchPath, search_function);
Guido van Rossumb95de4f2000-03-31 17:25:23 +000074
75 onError:
76 return -1;
Guido van Rossumfeee4b92000-03-10 22:57:27 +000077}
78
Guido van Rossum9e896b32000-04-05 20:11:21 +000079/* Convert a string to a normalized Python string: all characters are
80 converted to lower case, spaces are replaced with underscores. */
81
Guido van Rossumfeee4b92000-03-10 22:57:27 +000082static
Guido van Rossum9e896b32000-04-05 20:11:21 +000083PyObject *normalizestring(const char *string)
Guido van Rossumfeee4b92000-03-10 22:57:27 +000084{
Guido van Rossum33831132000-06-29 14:50:15 +000085 register size_t i;
Guido van Rossum582acec2000-06-28 22:07:35 +000086 size_t len = strlen(string);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000087 char *p;
88 PyObject *v;
89
Guido van Rossum582acec2000-06-28 22:07:35 +000090 if (len > INT_MAX) {
91 PyErr_SetString(PyExc_OverflowError, "string is too large");
92 return NULL;
93 }
94
95 v = PyString_FromStringAndSize(NULL, (int)len);
Guido van Rossumfeee4b92000-03-10 22:57:27 +000096 if (v == NULL)
97 return NULL;
98 p = PyString_AS_STRING(v);
Guido van Rossum9e896b32000-04-05 20:11:21 +000099 for (i = 0; i < len; i++) {
100 register char ch = string[i];
101 if (ch == ' ')
102 ch = '-';
103 else
104 ch = tolower(ch);
105 p[i] = ch;
106 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000107 return v;
108}
109
110/* Lookup the given encoding and return a tuple providing the codec
111 facilities.
112
113 The encoding string is looked up converted to all lower-case
114 characters. This makes encodings looked up through this mechanism
115 effectively case-insensitive.
116
Fred Drake766de832000-05-09 19:55:59 +0000117 If no codec is found, a LookupError is set and NULL returned.
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000118
119 As side effect, this tries to load the encodings package, if not
120 yet done. This is part of the lazy load strategy for the encodings
121 package.
122
123*/
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000124
125PyObject *_PyCodec_Lookup(const char *encoding)
126{
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000127 PyObject *result, *args = NULL, *v;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000128 int i, len;
129
Fred Drake766de832000-05-09 19:55:59 +0000130 if (encoding == NULL) {
131 PyErr_BadArgument();
132 goto onError;
133 }
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000134 if (_PyCodec_SearchCache == NULL ||
135 _PyCodec_SearchPath == NULL) {
Barry Warsaw51ac5802000-03-20 16:36:48 +0000136 PyErr_SetString(PyExc_SystemError,
137 "codec module not properly initialized");
138 goto onError;
139 }
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000140 if (!import_encodings_called) {
141 if (import_encodings())
142 goto onError;
143 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000144
Guido van Rossum9e896b32000-04-05 20:11:21 +0000145 /* Convert the encoding to a normalized Python string: all
Thomas Wouters7e474022000-07-16 12:04:32 +0000146 characters are converted to lower case, spaces and hyphens are
Guido van Rossum9e896b32000-04-05 20:11:21 +0000147 replaced with underscores. */
148 v = normalizestring(encoding);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000149 if (v == NULL)
150 goto onError;
151 PyString_InternInPlace(&v);
152
153 /* First, try to lookup the name in the registry dictionary */
154 result = PyDict_GetItem(_PyCodec_SearchCache, v);
155 if (result != NULL) {
156 Py_INCREF(result);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000157 Py_DECREF(v);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000158 return result;
159 }
160
161 /* Next, scan the search functions in order of registration */
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000162 args = PyTuple_New(1);
163 if (args == NULL)
164 goto onError;
165 PyTuple_SET_ITEM(args,0,v);
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000166
167 len = PyList_Size(_PyCodec_SearchPath);
168 if (len < 0)
169 goto onError;
Guido van Rossumb95de4f2000-03-31 17:25:23 +0000170 if (len == 0) {
171 PyErr_SetString(PyExc_LookupError,
172 "no codec search functions registered: "
173 "can't find encoding");
174 goto onError;
175 }
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000176
177 for (i = 0; i < len; i++) {
178 PyObject *func;
179
180 func = PyList_GetItem(_PyCodec_SearchPath, i);
181 if (func == NULL)
182 goto onError;
Guido van Rossum5ba3c842000-03-24 20:52:23 +0000183 result = PyEval_CallObject(func, args);
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000184 if (result == NULL)
185 goto onError;
186 if (result == Py_None) {
187 Py_DECREF(result);
188 continue;
189 }
190 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
191 PyErr_SetString(PyExc_TypeError,
192 "codec search functions must return 4-tuples");
193 Py_DECREF(result);
194 goto onError;
195 }
196 break;
197 }
198 if (i == len) {
199 /* XXX Perhaps we should cache misses too ? */
200 PyErr_SetString(PyExc_LookupError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000201 "unknown encoding");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000202 goto onError;
203 }
204
205 /* Cache and return the result */
206 PyDict_SetItem(_PyCodec_SearchCache, v, result);
207 Py_DECREF(args);
208 return result;
209
210 onError:
211 Py_XDECREF(args);
212 return NULL;
213}
214
215static
216PyObject *args_tuple(PyObject *object,
217 const char *errors)
218{
219 PyObject *args;
220
221 args = PyTuple_New(1 + (errors != NULL));
222 if (args == NULL)
223 return NULL;
224 Py_INCREF(object);
225 PyTuple_SET_ITEM(args,0,object);
226 if (errors) {
227 PyObject *v;
228
229 v = PyString_FromString(errors);
230 if (v == NULL) {
231 Py_DECREF(args);
232 return NULL;
233 }
234 PyTuple_SET_ITEM(args, 1, v);
235 }
236 return args;
237}
238
239/* Build a codec by calling factory(stream[,errors]) or just
240 factory(errors) depending on whether the given parameters are
241 non-NULL. */
242
243static
244PyObject *build_stream_codec(PyObject *factory,
245 PyObject *stream,
246 const char *errors)
247{
248 PyObject *args, *codec;
249
250 args = args_tuple(stream, errors);
251 if (args == NULL)
252 return NULL;
253
254 codec = PyEval_CallObject(factory, args);
255 Py_DECREF(args);
256 return codec;
257}
258
259/* Convenience APIs to query the Codec registry.
260
261 All APIs return a codec object with incremented refcount.
262
263 */
264
265PyObject *PyCodec_Encoder(const char *encoding)
266{
267 PyObject *codecs;
268 PyObject *v;
269
270 codecs = _PyCodec_Lookup(encoding);
271 if (codecs == NULL)
272 goto onError;
273 v = PyTuple_GET_ITEM(codecs,0);
274 Py_INCREF(v);
275 return v;
276
277 onError:
278 return NULL;
279}
280
281PyObject *PyCodec_Decoder(const char *encoding)
282{
283 PyObject *codecs;
284 PyObject *v;
285
286 codecs = _PyCodec_Lookup(encoding);
287 if (codecs == NULL)
288 goto onError;
289 v = PyTuple_GET_ITEM(codecs,1);
290 Py_INCREF(v);
291 return v;
292
293 onError:
294 return NULL;
295}
296
297PyObject *PyCodec_StreamReader(const char *encoding,
298 PyObject *stream,
299 const char *errors)
300{
301 PyObject *codecs;
302
303 codecs = _PyCodec_Lookup(encoding);
304 if (codecs == NULL)
305 goto onError;
306 return build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
307
308 onError:
309 return NULL;
310}
311
312PyObject *PyCodec_StreamWriter(const char *encoding,
313 PyObject *stream,
314 const char *errors)
315{
316 PyObject *codecs;
317
318 codecs = _PyCodec_Lookup(encoding);
319 if (codecs == NULL)
320 goto onError;
321 return build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
322
323 onError:
324 return NULL;
325}
326
327/* Encode an object (e.g. an Unicode object) using the given encoding
328 and return the resulting encoded object (usually a Python string).
329
330 errors is passed to the encoder factory as argument if non-NULL. */
331
332PyObject *PyCodec_Encode(PyObject *object,
333 const char *encoding,
334 const char *errors)
335{
336 PyObject *encoder = NULL;
337 PyObject *args = NULL, *result;
338 PyObject *v;
339
340 encoder = PyCodec_Encoder(encoding);
341 if (encoder == NULL)
342 goto onError;
343
344 args = args_tuple(object, errors);
345 if (args == NULL)
346 goto onError;
347
348 result = PyEval_CallObject(encoder,args);
349 if (result == NULL)
350 goto onError;
351
352 if (!PyTuple_Check(result) ||
353 PyTuple_GET_SIZE(result) != 2) {
354 PyErr_SetString(PyExc_TypeError,
355 "encoder must return a tuple (object,integer)");
356 goto onError;
357 }
358 v = PyTuple_GET_ITEM(result,0);
359 Py_INCREF(v);
360 /* We don't check or use the second (integer) entry. */
361
362 Py_DECREF(args);
363 Py_DECREF(encoder);
364 Py_DECREF(result);
365 return v;
366
367 onError:
368 Py_XDECREF(args);
369 Py_XDECREF(encoder);
370 return NULL;
371}
372
373/* Decode an object (usually a Python string) using the given encoding
374 and return an equivalent object (e.g. an Unicode object).
375
376 errors is passed to the decoder factory as argument if non-NULL. */
377
378PyObject *PyCodec_Decode(PyObject *object,
379 const char *encoding,
380 const char *errors)
381{
382 PyObject *decoder = NULL;
383 PyObject *args = NULL, *result = NULL;
384 PyObject *v;
385
386 decoder = PyCodec_Decoder(encoding);
387 if (decoder == NULL)
388 goto onError;
389
390 args = args_tuple(object, errors);
391 if (args == NULL)
392 goto onError;
393
394 result = PyEval_CallObject(decoder,args);
395 if (result == NULL)
396 goto onError;
397 if (!PyTuple_Check(result) ||
398 PyTuple_GET_SIZE(result) != 2) {
399 PyErr_SetString(PyExc_TypeError,
400 "decoder must return a tuple (object,integer)");
401 goto onError;
402 }
403 v = PyTuple_GET_ITEM(result,0);
404 Py_INCREF(v);
405 /* We don't check or use the second (integer) entry. */
406
407 Py_DECREF(args);
408 Py_DECREF(decoder);
409 Py_DECREF(result);
410 return v;
411
412 onError:
413 Py_XDECREF(args);
414 Py_XDECREF(decoder);
415 Py_XDECREF(result);
416 return NULL;
417}
418
Thomas Woutersf70ef4f2000-07-22 18:47:25 +0000419void _PyCodecRegistry_Init(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000420{
421 if (_PyCodec_SearchPath == NULL)
422 _PyCodec_SearchPath = PyList_New(0);
423 if (_PyCodec_SearchCache == NULL)
424 _PyCodec_SearchCache = PyDict_New();
425 if (_PyCodec_SearchPath == NULL ||
426 _PyCodec_SearchCache == NULL)
Thomas Wouters7e474022000-07-16 12:04:32 +0000427 Py_FatalError("can't initialize codec registry");
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000428}
429
Thomas Woutersf70ef4f2000-07-22 18:47:25 +0000430void _PyCodecRegistry_Fini(void)
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000431{
432 Py_XDECREF(_PyCodec_SearchPath);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000433 _PyCodec_SearchPath = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000434 Py_XDECREF(_PyCodec_SearchCache);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000435 _PyCodec_SearchCache = NULL;
Guido van Rossumfeee4b92000-03-10 22:57:27 +0000436}