blob: 0b43a72d709d083a5d65389bc9a599d0dbdc7ca8 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Martin v. Löwis02d893c2001-08-02 07:15:29 +000010import struct, __builtin__
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Public names exported by "from codecs import *".  Every error handler
# that this module binds at module level is listed, including
# backslashreplace_errors, which was previously defined but not exported.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8 encoded BOM
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 BOM with an explicit byte order
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32 BOM with an explicit byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# BOMs in the byte order of the machine running this code ('=' selects
# native byte order with standard sizes in struct.pack)
BOM_UTF16 = struct.pack('=H', 0xFEFF)
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Short aliases for the UTF-16 marks
BOM = BOM_UTF16
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names these are the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000062
63
64### Codec base classes (defining the API)
65
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  These string values are
        defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character; the
                     builtin Unicode codecs use the official U+FFFD
                     REPLACEMENT CHARACTER.

        This base class implements nothing itself: both methods raise
        NotImplementedError and are meant to be overridden.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec instance;
            codecs that need state belong in the stream classes
            (StreamWriter/StreamReader).

            Zero length input has to be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object providing the bf_getreadbuf buffer
            slot; Python strings, buffer objects and memory mapped
            files are examples.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec instance;
            codecs that need state belong in the stream classes
            (StreamWriter/StreamReader).

            Zero length input has to be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()
122
123#
124# The StreamWriter and StreamReader class provide generic working
Andrew M. Kuchling97c56352001-09-18 20:29:48 +0000125# interfaces which can be used to implement new encoding submodules
Guido van Rossum0612d842000-03-10 23:20:43 +0000126# very easily. See encodings/utf_8.py for an example on how this is
127# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000128#
Guido van Rossum0612d842000-03-10 23:20:43 +0000129
class StreamWriter(Codec):

    """ Codec that encodes objects and writes them to a wrapped stream.

        Subclasses provide .encode(); everything not defined here is
        looked up on the wrapped stream.

    """
    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that is handed to
            .encode() on every write.  These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the character and continue with the next
             'replace'- substitute a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with the configured error handling and write
            the result to self.stream.
        """
        # Only the encoded output is used; the "length consumed" part of
        # the result is not needed here.
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings and hand the result to .write() in
            a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state that
            allows appending fresh data without rescanning the whole
            stream to recover state.  The base implementation keeps no
            state and therefore does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the wrapped stream for every attribute that is
            not found on the writer itself.
        """
        stream = self.stream
        return getattr(stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000183
184###
185
class StreamReader(Codec):

    """ Codec that reads data from a wrapped stream and decodes it.

        Subclasses provide .decode(); everything not defined here is
        looked up on the wrapped stream.

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The default
            value -1 (any negative value) means: read and decode
            everything that is available. size is intended to prevent
            having to decode huge files in one step.

            When the data read for a given size cannot be decoded (the
            decoder raises ValueError), single additional bytes are
            read and decoding is retried, at most 10 times.  The
            ValueError is re-raised when the retries are used up or
            the stream has no more data.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        attempts = 0
        while True:
            try:
                # The number of bytes the decoder reports as consumed
                # is not used: the whole chunk is expected to decode.
                decoded, _consumed = decode(data, self.errors)
            except ValueError:
                # The chunk may have ended in the middle of a
                # multi-byte sequence.  This is slow but should work
                # under pretty much all conditions: extend the chunk
                # one byte at a time and try again.
                attempts = attempts + 1
                extra = read(1)
                if not extra or attempts > 10:
                    raise
                data = data + extra
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The data is decoded first and split into lines afterwards;
            line breaks are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.  The base implementation keeps no
            state and therefore does nothing.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when .readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        # The reader is its own iterator; see .next().
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000319
320###
321
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and writing.

        Reading is delegated to a reader object, writing to a writer
        object; both are created on the same stream.  The design is
        such that the factory functions returned by codecs.lookup()
        can be used to construct the instance.

    """
    # Optional attribute; overwritten by the file wrappers in this module
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively; each is called as factory(stream, errors).

            errors is handled the same way as for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: forwarded to self.reader

    def read(self, size=-1):

        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    # --- writing: forwarded to self.writer

    def write(self, data):

        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        # Reader first, then writer.
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the wrapped stream for every attribute that is
            not found on the wrapper itself.
        """
        stream = self.stream
        return getattr(stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000392
393###
394
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encoder.  size, if given, is passed on
            to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read via the backend reader, encode with the frontend
            encoder and return the result split into lines (line
            breaks included).  sizehint, if given, is passed as size
            to the reader's .read().
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder.

            StopIteration raised by the backend reader is passed
            through unchanged.
        """
        data = self.reader.next()
        # Recode exactly like .readline() does, so that iterating over
        # the recoder yields the same data as repeated .readline() calls
        # instead of the reader's intermediate representation.
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        # The recoder is its own iterator; see .next().
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings, decode them with the frontend
            decoder and write the result via the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000503
504### Shortcuts
505
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError in case the encoding cannot be found; the
        file opened for the call is closed again before the error is
        passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        # Only the stream reader/writer factories of the codec entry
        # are needed here.
        Reader, Writer = lookup(encoding)[2:]
        srw = StreamReaderWriter(file, Reader, Writer, errors)
    except:
        # Whatever went wrong (typically an unknown encoding): the
        # caller never gets hold of the file object, so close it here
        # instead of leaking the handle, then pass the error on.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000549
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings on the fly.

        Strings written to the wrapper are interpreted according to
        data_encoding and then written to the original file as string
        using file_encoding. The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        Strings read from the file are interpreted using file_encoding
        and handed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries the two extra attributes
        .data_encoding and .file_encoding which reflect the parameters
        of the same name and can be used for introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: the stateless encoder/decoder pair of the data encoding
    to_data, from_data = lookup(data_encoding)[:2]
    # Backend: the stream reader/writer factories of the file encoding
    make_reader, make_writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file, to_data, from_data,
                            make_reader, make_writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000586
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000587### Helpers for codec lookup
588
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The encoder is the first item of the codec entry.
    entry = lookup(encoding)
    return entry[0]
598
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The decoder is the second item of the codec entry.
    entry = lookup(encoding)
    return entry[1]
608
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The stream reader factory is the third item of the codec entry.
    entry = lookup(encoding)
    return entry[2]
618
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The stream writer factory is the fourth item of the codec entry.
    entry = lookup(encoding)
    return entry[3]
628
Marc-André Lemburga866df82001-01-03 21:29:14 +0000629### Helpers for charmap-based codecs
630
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of rng is both key
        and value, i.e. each element is mapped to itself.

    """
    # rng is walked exactly once, so any iterable works.
    return dict([(item, item) for item in rng])
643
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map has no
        unique source, so it is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: ambiguous, mark as undefined.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000664
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000665### error handlers
666
667strict_errors = lookup_error("strict")
668ignore_errors = lookup_error("ignore")
669replace_errors = lookup_error("replace")
670xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
671backslashreplace_errors = lookup_error("backslashreplace")
672
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000673# Tell modulefinder that using codecs probably needs the encodings
674# package
675_false = 0
676if _false:
677 import encodings
678
Guido van Rossum0612d842000-03-10 23:20:43 +0000679### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000680
Guido van Rossum0612d842000-03-10 23:20:43 +0000681if __name__ == '__main__':
682
683 import sys
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000684
Guido van Rossuma3277132000-04-11 15:37:43 +0000685 # Make stdout translate Latin-1 output into UTF-8 output
686 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000687
Guido van Rossuma3277132000-04-11 15:37:43 +0000688 # Have stdin translate Latin-1 input into UTF-8 input
689 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')