blob: e7967d8744c2802bc769f6d981245f4abb46d83f [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Martin v. Löwis02d893c2001-08-02 07:15:29 +000010import struct, __builtin__
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Public names exported by "from codecs import *".  All five predefined
# error handler callbacks bound at the bottom of this module are listed,
# including backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8 encoded BOM
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 BOMs with explicit byte order (BOM_LE / BOM_BE are short aliases)
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32 BOMs with explicit byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# BOMs in the byte order of the machine running this code; struct's '='
# prefix selects native byte order with standard sizes.
BOM_UTF16 = struct.pack('=H', 0xFEFF)
BOM = BOM_UTF16
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code): the "32"/"64" names refer to
# the UTF-16/UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000062
63
64### Codec base classes (defining the API)
65
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec instance;
            use StreamWriter for codecs which have to keep state in
            order to make encoding efficient.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec instance;
            use StreamReader for codecs which have to keep state in
            order to make decoding efficient.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
129
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000136
class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to
        a wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that is passed
            to .encode().  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object using self.errors and write the result to
            self.stream.
        """
        # The "length consumed" part of the result is not needed here.
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings and hand the result to .write()
            in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            that allows appending fresh data without rescanning the
            whole stream to recover state.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate lookups of attributes not found on the writer
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000196
197###
198
class StreamReader(Codec):

    """ Reads data from a wrapped stream and decodes it with
        self.decode().
    """

    def __init__(self, stream, errors='strict'):

        """ Create a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            errors selects the error handling scheme that is passed
            to .decode().  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decode data from self.stream and return the resulting
            object.

            size is the approximate maximum number of bytes to read
            from the stream for decoding purposes; the decoder may
            adjust it as appropriate.  The default of -1 means read
            and decode as much as possible.  size is meant to avoid
            having to decode huge files in one step.

            A greedy strategy should be used: read as much data as
            the encoding definition and the given size allow, e.g.
            optional encoding endings or state markers available on
            the stream should be read too.

        """
        if size < 0:
            # Unsliced reading: decode everything the stream returns.
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading: the chunk may end in the middle of an encoded
        # sequence, so on a decoding error pull in one more byte and
        # try again.
        fetch = self.stream.read
        decode = self.decode
        pending = fetch(size)
        attempts = 0
        while 1:
            try:
                decoded, _consumed = decode(pending, self.errors)
            except ValueError:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                attempts = attempts + 1
                extra = fetch(1)
                if not extra or attempts > 10:
                    # Stream exhausted or retry budget used up:
                    # propagate the decoding error.
                    raise
                pending = pending + extra
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike .readlines(), this method relies on the line
            breaking knowledge of the underlying stream's .readline()
            method -- there is currently no support for line breaking
            using the codec decoder due to lack of line buffering.
            Subclasses should however, if possible, try to implement
            this method using their own knowledge of line breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            raw = self.stream.readline()
        else:
            raw = self.stream.readline(size)
        return self.decode(raw, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream and return
            them as a list of lines.

            The data is decoded first and then split into lines;
            line breaks are kept in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            raw = self.stream.read()
        else:
            raw = self.stream.read(sizehint)
        decoded = self.decode(raw, self.errors)[0]
        return decoded.splitlines(1)

    def reset(self):

        """ Reset the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation does nothing.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream;
            raise StopIteration once .readline() returns an empty
            result.
        """
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate lookups of attributes not found on the reader
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000334
335###
336
class StreamReaderWriter:

    """ Wrapper for streams which work in both read and write modes.

        Reading is delegated to a reader object and writing to a
        writer object, both created around the same stream.  The
        design is such that the factory functions returned by the
        codecs.lookup() function can be used to construct the
        instance.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: forwarded to self.reader

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    # --- writing: forwarded to self.writer

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        # Reset the reader first, then the writer.
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate lookups of attributes not found on the wrapper
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000407
408###
409
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Create a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encoder.  size, if given, is passed on to
            the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read via the backend reader, encode with the frontend
            encoder and return the result split into lines (line
            breaks are kept).  sizehint, if given, is passed on to the
            reader's .read().
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder.

            StopIteration raised by the backend reader propagates
            unchanged.
        """
        data = self.reader.next()
        # Recode like read()/readline() do; returning the reader's line
        # directly would hand the caller the intermediate format instead
        # of the frontend encoding.
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings, decode them with the frontend
            decoder and write the result via the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000518
519### Shortcuts
520
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError in case the encoding cannot be found; the
        file opened for the call is closed again in that case.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        # Entries 2 and 3 are the StreamReader and StreamWriter factories.
        srw = StreamReaderWriter(file, info[2], info[3], errors)
    except:
        # Don't leak the open file handle when the codec lookup or the
        # wrapper construction fails; the original error is re-raised.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000564
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that it provides transparent encoding
        translation, and return the wrapper.

        Strings written to the wrapper are interpreted according to
        data_encoding and then written to the original file as string
        using file_encoding. The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        Strings read from the file are interpreted using file_encoding
        and handed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, which reflect the given
        parameters of the same name and can be used for introspection
        by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend translation uses the stateless encode/decode functions of
    # the data codec; backend translation uses the stream reader/writer
    # factories of the file codec.
    data_codec = lookup(data_encoding)
    encode, decode = data_codec[:2]
    file_codec = lookup(file_encoding)
    Reader, Writer = file_codec[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000601
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000602### Helpers for codec lookup
603
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (entry 0 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
613
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (entry 1 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
623
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (entry 2 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
633
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (entry 3 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
643
Marc-André Lemburga866df82001-01-03 21:29:14 +0000644### Helpers for charmap-based codecs
645
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is mapped to itself.

    """
    return dict([(item, item) for item in rng])
658
def make_encoding_map(decoding_map):

    """ Create an encoding map from a decoding map by inverting it.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # Second or later occurrence: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000679
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000680### error handlers
681
# Module-level names for the predefined error handling callbacks, as
# returned by lookup_error() for each scheme name.
strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
# The next two schemes are documented above as being for encoding only.
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")
687
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000688# Tell modulefinder that using codecs probably needs the encodings
689# package
690_false = 0
691if _false:
692 import encodings
693
Guido van Rossum0612d842000-03-10 23:20:43 +0000694### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000695
if __name__ == '__main__':

    import sys

    # Wrap stdout: strings written to it are interpreted as Latin-1
    # and reach the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Wrap stdin: data is read from the real stdin as Latin-1 and
    # handed to the caller as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')