blob: b089e907662a2b86dace6843d1d1773714c41d37 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Martin v. Löwis02d893c2001-08-02 07:15:29 +000010import struct, __builtin__
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Public names exported by "from codecs import *": the registry
# functions, the file wrappers and the byte order mark constants.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000024
Guido van Rossum0612d842000-03-10 23:20:43 +000025### Constants
26
27#
Walter Dörwald474458d2002-06-04 15:16:29 +000028# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
29# and its possible byte string values
30# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000031#
Guido van Rossum0612d842000-03-10 23:20:43 +000032
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16, native endianness ('=' selects the machine's byte order with
# standard sizes, so 'H' always packs exactly two bytes)
BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF)

# UTF-32, native endianness ('L' packs exactly four bytes under '=')
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code): despite the 32/64 suffixes
# these are plain aliases for the UTF-16 and UTF-32 marks defined above
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000059
60
61### Codec base classes (defining the API)
62
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme.  The following string
        values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec
            instance.  Codecs which need to keep state in order to
            encode efficiently should use StreamWriter instead.

            Zero length input must be accepted; in that case an empty
            object of the output object type has to be returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec
            instance.  Codecs which need to keep state in order to
            decode efficiently should use StreamReader instead.

            Zero length input must be accepted; in that case an empty
            object of the output object type has to be returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
119
120#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000125#
Guido van Rossum0612d842000-03-10 23:20:43 +0000126
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that is handed
            to .encode().  These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the encoded
            data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and hand the result to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            new fresh data without having to rescan the whole stream
            to recover state.

            The base implementation does nothing.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute that is not found on the writer
            itself to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000180
181###
182
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamReader on top of stream.

            stream must be a file-like object open for reading
            (binary) data.

            errors selects the error handling scheme that is handed
            to .decode().  These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Read data from self.stream, decode it and return the
            resulting object.

            size is the approximate maximum number of bytes to read
            from the stream for decoding purposes.  The decoder can
            modify this setting as appropriate.  The default of -1
            means: read and decode as much as possible.  size is
            intended to avoid having to decode huge files in one
            step.

            The method should use a greedy read strategy, i.e. read
            as much data as the encoding definition and the given
            size allow; optional encoding endings or state markers
            available on the stream should be read as well.

            For a non-negative size, a ValueError raised by the
            decoder makes the method read one more byte and retry.
            The error is re-raised once the stream delivers no more
            data or more than 10 such retries were needed.

        """
        if size < 0:
            # Unsliced reading: decode everything in a single step
            everything = self.stream.read()
            return self.decode(everything, self.errors)[0]

        # Sliced reading
        fetch = self.stream.read
        decoder = self.decode
        buffered = fetch(size)
        attempts = 0
        while 1:
            try:
                result, consumed = decoder(buffered, self.errors)
            except ValueError:
                # The chunk may end inside a multi-byte sequence, so
                # extend it byte by byte.  This is slow but should
                # work under pretty much all conditions.
                attempts = attempts + 1
                extra = fetch(1)
                if attempts > 10 or not extra:
                    raise
                buffered = buffered + extra
            else:
                return result

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method takes
            its line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering.  Subclasses should however, if possible, try
            to implement this method using their own knowledge of
            line breaking.

            size, if given, is passed as size argument to the
            stream's .readline() method.

        """
        if size is None:
            raw = self.stream.readline()
        else:
            raw = self.stream.readline(size)
        decoded = self.decode(raw, self.errors)
        return decoded[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream and return
            them as a list of lines.

            The data is decoded first and then split with
            .splitlines(1), so the line breaks are included in the
            list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            raw = self.stream.read()
        else:
            raw = self.stream.read(sizehint)
        decoded = self.decode(raw, self.errors)
        return decoded[0].splitlines(1)

    def reset(self):

        """ Reset the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation does nothing.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute that is not found on the reader
            itself to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000305
306###
307
class StreamReaderWriter:

    """ Wrapper for streams which work in both read and write modes.

        The design is such that the factory functions returned by
        the codecs.lookup() function can be used to construct the
        instance.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each of them is called with (stream, errors).

            Error handling is done in the same way as defined for
            the StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Return the result of the reader's .read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def write(self, data):

        """ Pass data to the writer's .write() and return its result. """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Pass list to the writer's .writelines() and return its
            result.
        """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute that is not found on the wrapper
            itself to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000370
371###
372
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend view of
        encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first decoded into an
        intermediate format (which depends on the given codec
        combination) and then written to the stream using an
        instance of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder which implements a two-way
            conversion: encode and decode work on the frontend (the
            data seen by callers of .read() and .write()) while
            Reader and Writer work on the backend (reading from and
            writing to the stream).

            These objects can be used to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface;
            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each factory is called with (stream, errors).

            The intermediate format exchanged between the frontend
            and the backend depends on the codecs; it will usually be
            Unicode.

            Error handling is done in the same way as defined for
            the StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            for the frontend.
        """
        decoded = self.reader.read(size)
        recoded, _count = self.encode(decoded, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            encoded for the frontend.  size is only passed on to the
            reader if it is given.
        """
        if size is not None:
            decoded = self.reader.readline(size)
        else:
            decoded = self.reader.readline()
        recoded, _count = self.encode(decoded, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read via the backend reader's .read(), encode the data
            for the frontend and return it split into lines (line
            breaks included).  sizehint is only passed on to the
            reader if it is given.
        """
        if sizehint is not None:
            decoded = self.reader.read(sizehint)
        else:
            decoded = self.reader.read()
        recoded, _count = self.encode(decoded, self.errors)
        return recoded.splitlines(1)

    def write(self, data):

        """ Decode the frontend data and pass the result to the
            backend writer's .write(); its result is returned.
        """
        decoded, _count = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Concatenate the strings in list, decode the result and
            pass it to the backend writer's .write(); its result is
            returned.
        """
        joined = ''.join(list)
        decoded, _count = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute that is not found on the recoder
            itself to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000473
474### Shortcuts
475
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Exceptions raised by the builtin open() and by lookup() are
        passed on to the caller. If setting up the wrapper fails
        (e.g. lookup() does not know the encoding), the file opened
        here is closed again before the exception is re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        codec = lookup(encoding)
        # Entries 2 and 3 are the StreamReader and StreamWriter factories
        srw = StreamReaderWriter(file, codec[2], codec[3], errors)
    except:
        # Do not leak the file handle that was just opened; the original
        # exception is re-raised unchanged for the caller.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000519
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless encoder/decoder of the data encoding
    data_codec = lookup(data_encoding)
    # Backend: stream reader/writer factories of the file encoding
    file_codec = lookup(file_encoding)
    encode, decode = data_codec[:2]
    Reader, Writer = file_codec[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000556
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000557### Helpers for codec lookup
558
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (the first entry of the lookup()
        result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[0]
568
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (the second entry of the lookup()
        result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[1]
578
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (the third
        entry of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[2]
588
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (the fourth
        entry of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[3]
598
Marc-André Lemburga866df82001-01-03 21:29:14 +0000599### Helpers for charmap-based codecs
600
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a new dictionary in which every element of
        the rng sequence is used as a key mapped to itself.

    """
    return dict([(item, item) for item in rng])
613
def make_encoding_map(decoding_map):

    """ Create an encoding map from a decoding map by swapping keys
        and values.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # Ambiguous reverse mapping: mark the target as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000634
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000635# Tell modulefinder that using codecs probably needs the encodings
636# package
637_false = 0
638if _false:
639 import encodings
640
Guido van Rossum0612d842000-03-10 23:20:43 +0000641### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000642
if __name__ == '__main__':

    # Small demo, only run when the module is executed as a script:
    # both standard streams are replaced by recoding wrappers.

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data written by the program is Latin-1, the stream gets UTF-8)
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (the stream delivers Latin-1, the program reads UTF-8)
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')