blob: 92c6feff31f0ae0881cbb3fb2f7d881a33bc7807 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
try:
    from _codecs import *
except ImportError:
    # The registry and the builtin stateless codec functions all come from
    # the _codecs helper module; report a failed import as a SystemError
    # that carries the original ImportError as its reason.
    raise SystemError('Failed to load the builtin codecs: %s'
                      % sys.exc_info()[1])
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Public names exported by "from codecs import *".  Every error handler
# bound at module level (see the "error handlers" section) is listed here,
# including backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
Walter Dörwald474458d2002-06-04 15:16:29 +000036# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native-endianness aliases: anything that is not a little endian
# machine is treated as big endian.
if sys.byteorder != 'little':

    # UTF-16, native endianness
    BOM_UTF16 = BOM_UTF16_BE
    BOM = BOM_UTF16

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

else:

    # UTF-16, native endianness
    BOM_UTF16 = BOM_UTF16_LE
    BOM = BOM_UTF16

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state on the Codec
            instance; codecs that need to keep state should build on
            StreamWriter/StreamReader instead.

            Zero length input must be supported and must result in
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state on the Codec
            instance; codecs that need to keep state should build on
            StreamWriter/StreamReader instead.

            Zero length input must be supported and must result in
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
139
140#
141# The StreamWriter and StreamReader class provide generic working
Andrew M. Kuchling97c56352001-09-18 20:29:48 +0000142# interfaces which can be used to implement new encoding submodules
Guido van Rossum0612d842000-03-10 23:20:43 +0000143# very easily. See encodings/utf_8.py for an example on how this is
144# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000145#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance wrapping stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() using self.errors and
            write the encoded data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given list of strings and hand the result to
            .write() in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows appending fresh data without having to
            rescan the whole stream to recover state.

            The base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all other attributes on the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamReader instance wrapping stream.

            stream must be a file-like object open for reading
            (binary) data.

            errors selects the error handling scheme used when
            decoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decode data from self.stream and return the resulting
            object.

            size is the approximate maximum number of bytes to read
            from the stream for decoding purposes.  A negative size
            (the default) reads and decodes everything the stream
            returns from a plain .read().

            For non-negative sizes, size bytes are read first.  If
            decoding them raises a ValueError, single bytes are read
            and appended and decoding is retried.  The error is
            re-raised once the stream is exhausted or after more than
            10 failed attempts.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        fetch = self.stream.read
        decoder = self.decode
        chunk = fetch(size)
        failures = 0
        while True:
            try:
                decoded, _consumed = decoder(chunk, self.errors)
            except ValueError:
                # Slow, but should work under pretty much all
                # conditions: extend the chunk one byte at a time.
                failures += 1
                extra = fetch(1)
                if not extra or failures > 10:
                    raise
                chunk = chunk + extra
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method relies
            on the underlying stream's .readline() to find the line
            end -- the codec decoder is not used for line breaking.
            Subclasses should, if possible, implement this method
            using their own knowledge of line breaking.

            size, if given, is passed as size argument to the
            stream's .readline() method.

        """
        readline_args = ()
        if size is not None:
            readline_args = (size,)
        raw_line = self.stream.readline(*readline_args)
        return self.decode(raw_line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all data available on the input stream and return
            it as list of decoded lines.

            The data is decoded first and then split into lines;
            line breaks are kept in the list entries.

            sizehint is accepted but ignored.

        """
        raw = self.stream.read()
        decoded = self.decode(raw, self.errors)[0]
        return decoded.splitlines(True)

    def reset(self):

        """ Reset the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base implementation does nothing.

        """
        pass

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when .readline() returns an empty
            result.
        """
        line = self.readline()
        if not line:
            raise StopIteration
        return line

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all other attributes on the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000341
342###
343
class StreamReaderWriter:

    """ Wrapper for streams which work in both read and write
        modes.

        Reading is delegated to a reader object and writing to a
        writer object, both created for the same stream.  The
        factory functions returned by the codec.lookup() function
        can be used to construct the instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called with (stream, errors).

            Error handling is done in the same way as defined for
            the StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's .read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Return the result of the writer's .write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Return the result of the writer's .writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec_stream in (self.reader, self.writer):
            codec_stream.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all other attributes on the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000414
415###
416
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned encoded to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.  Reader and
            Writer are each called with (stream, errors).

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data
            re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.

            size, if given, is passed on to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, re-encode it and
            return it as list of lines with line breaks included.

            sizehint is accepted but ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder.

            The line is fetched with the backend reader's .next() and
            then passed through self.encode, so that iteration yields
            the same kind of data as .readline().
        """
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given list of strings, decode the result with
            the frontend decoder and write it via the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000522
523### Shortcuts
524
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or creating the wrapper fails, the
        file is closed again before the exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        sr, sw = lookup(encoding)[2:]
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Deliberately catches everything: whatever goes wrong here,
        # the file opened above must not be leaked.  The original
        # exception is re-raised unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000568
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which transparently translates
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The stateless functions come from the data encoding, the stream
    # classes from the file encoding.
    data_codec = lookup(data_encoding)
    file_codec = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            data_codec[0], data_codec[1],
                            file_codec[2], file_codec[3],
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000605
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000606### Helpers for codec lookup
607
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (the first entry of the lookup()
        result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
617
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (the second entry of the lookup()
        result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
627
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (the third
        entry of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
637
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (the fourth
        entry of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
647
Marc-André Lemburga866df82001-01-03 21:29:14 +0000648### Helpers for charmap-based codecs
649
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a new dictionary in which every element
        of the rng sequence is both key and value.

    """
    # dict() over a list of pairs keeps this usable on Python versions
    # without dict comprehensions.
    return dict([(item, item) for item in rng])
662
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Ambiguous reverse mapping: mark the target as undefined.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000683
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000684### error handlers
685
# Bind the predefined error handling callbacks to module level names,
# looking them up in the same order as the names appear on the left.
(strict_errors,
 ignore_errors,
 replace_errors,
 xmlcharrefreplace_errors,
 backslashreplace_errors) = map(lookup_error,
                                ("strict",
                                 "ignore",
                                 "replace",
                                 "xmlcharrefreplace",
                                 "backslashreplace"))
691
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000692# Tell modulefinder that using codecs probably needs the encodings
693# package
# The import below is guarded by a flag that is always false, so it is
# never executed at runtime; the statement is only present so that tools
# scanning this module for imports see the encodings package.
# NOTE(review): presumably a name rather than a literal 0 is tested so
# that the import statement is not optimized away -- confirm.
_false = 0
if _false:
    import encodings
697
Guido van Rossum0612d842000-03-10 23:20:43 +0000698### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000699
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output:
    # data written by the program is taken as Latin-1 and reaches the
    # real stdout as UTF-8.
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # data on the real stdin is taken as Latin-1 and handed to the
    # program as UTF-8.
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')