blob: 666afad6b6ec7a5a0f546ea5157e762eb26bda18 [file] [log] [blame]
""" Standard "encodings" Package

    Standard Python encoding modules are stored in this package
    directory.

    Codec modules must have names corresponding to normalized encoding
    names as defined in the normalize_encoding() function below, e.g.
    'utf-8' must be implemented by the module 'utf_8.py'.

    Each codec module must export the following interface:

    * getregentry() -> (encoder, decoder, stream_reader, stream_writer)
    The getregentry() API must return callable objects which adhere to
    the Python Codec Interface Standard.

    In addition, a module may optionally also define the following
    APIs which are then used by the package's codec search function:

    * getaliases() -> sequence of encoding name strings to use as aliases

    Alias names returned by getaliases() must be normalized encoding
    names as defined by normalize_encoding().

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
29
Marc-André Lemburg28201252003-05-16 17:07:51 +000030import codecs, exceptions, types
Guido van Rossum0229bf62000-03-10 23:17:24 +000031
# Maps encoding names, exactly as they were passed to search_function(),
# to the codec registry entry found for them, or to None for a cached miss.
_cache = {}

# Sentinel default for _cache.get(): lets search_function() tell "never
# looked up" apart from a cached None.
_unknown = '--unknown--'

# Passed as the fromlist argument of __import__(); a non-empty fromlist
# makes __import__() return the named submodule rather than the top-level
# package.
_import_tail = ['*']

# Translation table for str.translate() used by normalize_encoding().
# str.translate() requires exactly 256 entries, so the table is built
# programmatically instead of being spelled out as a literal whose
# whitespace runs are easy to get wrong: ASCII letters, digits and the dot
# map to themselves, every other character maps to a space.
_norm_encoding_map = [' '] * 256
for _char in ('.0123456789'
              'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
              'abcdefghijklmnopqrstuvwxyz'):
    _norm_encoding_map[ord(_char)] = _char
_norm_encoding_map = ''.join(_norm_encoding_map)
del _char
Guido van Rossum0229bf62000-03-10 23:17:24 +000041
class CodecRegistryError(LookupError, SystemError):

    """ Raised by search_function() when a codec module's getregentry()
        result is not a 4-tuple of callable objects.

        Derives from both LookupError and SystemError, so handlers for
        either base class catch it.  The built-in exception names are
        used directly instead of going through the exceptions module.

    """
45
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

        Returns the normalized name as an 8-bit string.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings.  isinstance() is used instead of
    # an exact type comparison so that subclasses of unicode are
    # converted as well.
    if isinstance(encoding, types.UnicodeType):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')
    # The table turns every disallowed character into a space; split()
    # then drops leading/trailing runs and collapses inner ones.
    return '_'.join(encoding.translate(_norm_encoding_map).split())
Marc-André Lemburg70126732002-10-04 11:45:38 +000067
def search_function(encoding):

    """ Codec search function for the encodings package.

        Looks up the codec module for the given encoding name and
        returns the tuple built from its getregentry() result
        (encoder, decoder, stream_reader, stream_writer), or None if
        no codec module could be found.  Both hits and misses are
        cached under the encoding name exactly as it was passed in.

        Raises CodecRegistryError if getregentry() does not return
        exactly four callable objects.

        As a side effect, aliases reported by the module's optional
        getaliases() API are added to the aliases mapping unless they
        are already registered.

    """
    # Cache lookup
    entry = _cache.get(encoding, _unknown)
    if entry is not _unknown:
        return entry

    # Import the module:
    #
    # First look in the encodings package, then try to lookup the
    # encoding in the aliases mapping and retry the import using the
    # default import module lookup scheme with the alias name.
    #
    modname = normalize_encoding(encoding)
    try:
        mod = __import__('encodings.' + modname,
                         globals(), locals(), _import_tail)
    except ImportError:
        import aliases
        modname = (aliases.aliases.get(modname) or
                   aliases.aliases.get(modname.replace('.', '_')) or
                   modname)
        # NOTE(review): this fallback imports whatever module the
        # normalized or aliased name refers to, not only modules of
        # this package.  If encoding names can come from untrusted
        # input, consider restricting the lookup.
        try:
            mod = __import__(modname, globals(), locals(), _import_tail)
        except ImportError:
            mod = None

    try:
        # When mod is None this raises AttributeError as well, so a
        # failed import and a non-codec module share one code path.
        getregentry = mod.getregentry
    except AttributeError:
        # Not a codec module
        mod = None

    if mod is None:
        # Cache misses
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry
    entry = tuple(getregentry())
    # Not every module has a __file__ attribute; fall back to a
    # placeholder so that building the error message cannot itself
    # fail with an AttributeError.
    origin = (mod.__name__, getattr(mod, '__file__', '<unknown>'))
    if len(entry) != 4:
        raise CodecRegistryError(
            'module "%s" (%s) failed to register' % origin)
    for obj in entry:
        if not callable(obj):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' % origin)

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases)
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        # getaliases() is optional
        pass
    else:
        import aliases
        for alias in codecaliases:
            if alias not in aliases.aliases:
                aliases.aliases[alias] = modname

    # Return the registry entry
    return entry
135
# Register search_function in the Python codec registry as a codec
# search function.  This runs as a side effect of importing this module.
codecs.register(search_function)