Guido van Rossum | 0229bf6 | 2000-03-10 23:17:24 +0000 | [diff] [blame] | 1 | """ Standard "encodings" Package |
| 2 | |
| 3 | Standard Python encoding modules are stored in this package |
| 4 | directory. |
| 5 | |
Marc-André Lemburg | 7012673 | 2002-10-04 11:45:38 +0000 | [diff] [blame] | 6 | Codec modules must have names corresponding to normalized encoding |
| 7 | names as defined in the normalize_encoding() function below, e.g. |
| 8 | 'utf-8' must be implemented by the module 'utf_8.py'. |
Guido van Rossum | 0229bf6 | 2000-03-10 23:17:24 +0000 | [diff] [blame] | 9 | |
| 10 | Each codec module must export the following interface: |
| 11 | |
| 12 | * getregentry() -> (encoder, decoder, stream_reader, stream_writer) |
| 13 | The getregentry() API must return callable objects which adhere to |
| 14 | the Python Codec Interface Standard. |
| 15 | |
| 16 | In addition, a module may optionally also define the following |
| 17 | APIs which are then used by the package's codec search function: |
| 18 | |
| 19 | * getaliases() -> sequence of encoding name strings to use as aliases |
| 20 | |
Marc-André Lemburg | 7012673 | 2002-10-04 11:45:38 +0000 | [diff] [blame] | 21 | Alias names returned by getaliases() must be normalized encoding |
| 22 | names as defined by normalize_encoding(). |
Guido van Rossum | 0229bf6 | 2000-03-10 23:17:24 +0000 | [diff] [blame] | 23 | |
| 24 | Written by Marc-Andre Lemburg (mal@lemburg.com). |
| 25 | |
| 26 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY. |
| 27 | |
| 28 | """#" |
| 29 | |
Marc-André Lemburg | 2820125 | 2003-05-16 17:07:51 +0000 | [diff] [blame] | 30 | import codecs, exceptions, types |
Guido van Rossum | 0229bf6 | 2000-03-10 23:17:24 +0000 | [diff] [blame] | 31 | |
# Cache of search results, keyed by the encoding name exactly as it was
# passed to search_function().  Values are either the codec registry
# entry (a 4-tuple) or None for names that could not be resolved.
_cache = {}

# Sentinel used for cache lookups: None is a legitimate cached value
# (a remembered miss), so it cannot double as the "not cached" marker.
_unknown = '--unknown--'

# Non-empty "fromlist" argument handed to __import__() by
# search_function().
_import_tail = ['*']

# 256-entry translation table for str.translate(), indexed by character
# ordinal.  ASCII letters, digits and the dot map to themselves; every
# other character maps to a blank so that normalize_encoding() can split
# on whitespace.  The table is spelled out with explicit run lengths so
# its layout (and total length of exactly 256) is easy to verify.
_norm_encoding_map = (
    ' ' * 46 + '.' + ' ' +                      # 0x00-0x2F: only '.' kept
    '0123456789' + ' ' * 7 +                    # 0x30-0x40: digits
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + ' ' * 6 +    # 0x41-0x60: upper case
    'abcdefghijklmnopqrstuvwxyz' + ' ' * 133)   # 0x61-0xFF: lower case
Guido van Rossum | 0229bf6 | 2000-03-10 23:17:24 +0000 | [diff] [blame] | 41 | |
class CodecRegistryError(LookupError, SystemError):

    """ Error raised by search_function() when a codec module returns
        a registry entry that is malformed: either it does not have
        exactly four items or one of the items is not callable.

        Derives from both LookupError and SystemError, so handlers
        for either of them will catch it.

    """
| 45 | |
def normalize_encoding(encoding):

    """ Normalize an encoding name.

        Normalization works as follows: all non-alphanumeric
        characters except the dot used for Python package names are
        collapsed and replaced with a single underscore, e.g. '  -;#'
        becomes '_'. Leading and trailing underscores are removed.

        Note that encoding names should be ASCII only; if they do use
        non-ASCII characters, these must be Latin-1 compatible.

        encoding may be an 8-bit string or a Unicode string (including
        instances of unicode subclasses); the result is always an
        8-bit string.

    """
    # Make sure we have an 8-bit string, because .translate() works
    # differently for Unicode strings.  isinstance() is used instead of
    # an exact type comparison so that subclasses of unicode are
    # converted as well rather than slipping through to .translate().
    if isinstance(encoding, types.UnicodeType):
        # Note that .encode('latin-1') does *not* use the codec
        # registry, so this call doesn't recurse. (See unicodeobject.c
        # PyUnicode_AsEncodedString() for details)
        encoding = encoding.encode('latin-1')

    # The table turns every separator character into a blank; split()
    # then collapses runs of blanks and drops leading/trailing ones.
    return '_'.join(encoding.translate(_norm_encoding_map).split())
Marc-André Lemburg | 7012673 | 2002-10-04 11:45:38 +0000 | [diff] [blame] | 67 | |
def search_function(encoding):

    """ Codec search function of the encodings package.

        Looks up the codec module for encoding and returns its
        registry entry as a 4-tuple, or None if no codec module could
        be found.  Both outcomes are cached under the encoding name as
        given.

        Raises CodecRegistryError if the module's getregentry() does
        not yield exactly four callable objects; nothing is cached in
        that case.

    """
    # Cache lookup.  The _unknown sentinel is needed because None is a
    # valid cached value (a remembered miss).
    cached = _cache.get(encoding, _unknown)
    if cached is not _unknown:
        return cached

    # Import the module:
    #
    # First look in the encodings package, then try to lookup the
    # encoding in the aliases mapping and retry the import using the
    # default import module lookup scheme with the alias name.
    #
    modname = normalize_encoding(encoding)
    try:
        mod = __import__('encodings.' + modname,
                         globals(), locals(), _import_tail)
    except ImportError:
        import aliases
        alias_map = aliases.aliases
        # Try the normalized name, then the same name with dots turned
        # into underscores, and finally fall back to the name itself.
        modname = (alias_map.get(modname) or
                   alias_map.get(modname.replace('.', '_')) or
                   modname)
        try:
            mod = __import__(modname, globals(), locals(), _import_tail)
        except ImportError:
            mod = None

    try:
        getregentry = mod.getregentry
    except AttributeError:
        # Either both imports failed (mod is None) or the imported
        # module is not a codec module: cache the miss.
        _cache[encoding] = None
        return None

    # Now ask the module for the registry entry and validate it
    entry = tuple(getregentry())
    if len(entry) != 4:
        raise CodecRegistryError(
            'module "%s" (%s) failed to register' %
            (mod.__name__, mod.__file__))
    for member in entry:
        if not callable(member):
            raise CodecRegistryError(
                'incompatible codecs in module "%s" (%s)' %
                (mod.__name__, mod.__file__))

    # Cache the codec registry entry
    _cache[encoding] = entry

    # Register its aliases (without overwriting previously registered
    # aliases); getaliases() is optional for codec modules.
    try:
        codecaliases = mod.getaliases()
    except AttributeError:
        pass
    else:
        import aliases
        for alias in codecaliases:
            if alias not in aliases.aliases:
                aliases.aliases[alias] = modname

    # Return the registry entry
    return entry
| 135 | |
| 136 | # Register the search_function in the Python codec registry |
| 137 | codecs.register(search_function) |