blob: f831dd633758eb040d4da930aa85d808be2f8b59 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Public names exported by "from codecs import *".  Every predefined error
# handler bound at the bottom of this module is listed, including
# backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
Walter Dörwald474458d2002-06-04 15:16:29 +000036# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 marks for both byte orders, plus their short historical aliases
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 marks for both byte orders
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native-endianness aliases, selected from the byte order of this machine
if sys.byteorder == 'little':
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
else:
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE

# BOM without a suffix means the native UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs which have to keep state in order to encode or
            decode efficiently should build on the stream classes
            (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs which have to keep state in order to encode or
            decode efficiently should build on the stream classes
            (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter around stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that .write()
            hands to .encode(). These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() using self.errors and
            write the encoded data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings with '' and hand the result to
            .write() in one call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer itself to
            the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that .decode() has not consumed yet
        self.bytebuffer = ""
        # characters already decoded but not yet returned to the caller
        self.charbuffer = u""

    def decode(self, input, errors='strict'):

        """ Decode input and return (output object, length consumed).

            Must be provided by subclasses; this implementation raises
            NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available. With the default of -1 everything
            decoded so far is returned as soon as any characters are
            available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            Bytes that .decode() reports as not consumed are kept in
            self.bytebuffer and prepended to the data of the next read
            from the stream.

        """
        # read until we get the required number of characters (if available)
        done = False
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if self.charbuffer:
                    done = True
            elif len(self.charbuffer) >= chars:
                done = True
            if done:
                if chars < 0:
                    result = self.charbuffer
                    self.charbuffer = u""
                else:
                    result = self.charbuffer[:chars]
                    self.charbuffer = self.charbuffer[chars:]
                break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            data = self.bytebuffer + newdata
            object, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += object
            # the stream is exhausted: return whatever has been decoded
            if not newdata:
                done = True
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method. If it is not given, reading starts with a
            size of 10 which is doubled after each read while it is
            below 8000.

            Only "\\n" is recognized as line end. If keepends is
            false the line end is stripped from the returned line.
            At the end of the stream the remaining data is returned,
            which may be an empty string.

        """
        if size is None:
            size = 10
        line = u""
        while True:
            data = self.read(size)
            line += data
            pos = line.find("\n")
            if pos >= 0:
                # push everything after the line end back for later reads
                self.charbuffer = line[pos+1:] + self.charbuffer
                if keepends:
                    line = line[:pos+1]
                else:
                    line = line[:pos]
                return line
            elif not data:
                return line
            if size < 8000:
                size *= 2

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The complete remaining data is decoded with .read() and
            split with the .splitlines() method of the decoded object;
            line ends are included in the list entries unless keepends
            is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        # split the decoded data itself; the reader has no splitlines()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Pending undecoded bytes and decoded characters that have
            not been returned yet are discarded.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when .readline() returns an empty
            line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000367
368###
369
class StreamReaderWriter:

    """ Wrapper for streams which work in both read and write modes.

        Reading is delegated to a reader object and writing to a
        writer object, both created on the same stream.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp.; each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: forwarded to self.reader

    def read(self, size=-1):

        """ Return self.reader.read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return self.reader.readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return self.reader.readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    # --- writing: forwarded to self.writer

    def write(self, data):

        """ Return self.writer.write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Return self.writer.writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first and then the writer. """
        for part in (self.reader, self.writer):
            part.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the wrapper itself
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000440
441###
442
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via self.reader.read(size) and return the data
            passed through self.encode.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via self.reader.readline() and return it
            passed through self.encode.

            size is only handed to the reader if it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via self.reader.read(), pass it through
            self.encode and return the result split into lines with
            the line ends kept.

            sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, passed through
            self.encode like the results of the other read methods.
        """
        data = self.reader.next()
        # the reader yields the intermediate format; callers of the
        # frontend expect encoded data, exactly as from .readline()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Pass data through self.decode and write the result via
            self.writer.write().
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings with '', pass the result through
            self.decode and write it via self.writer.write().
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the reader and then the writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000548
549### Shortcuts
550
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file opened here is closed again before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Nobody else holds a reference to the file yet, so close it
        # here instead of leaking the handle; the error is re-raised
        # unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000594
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings transparently.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend: stateless functions of the data codec
    data_codec = lookup(data_encoding)
    encode, decode = data_codec[:2]
    # backend: stream classes of the file codec
    file_codec = lookup(file_encoding)
    Reader, Writer = file_codec[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000631
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000632### Helpers for codec lookup
633
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (entry 0 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
643
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (entry 1 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
653
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (entry 2 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
663
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (entry 3 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
673
Marc-André Lemburga866df82001-01-03 21:29:14 +0000674### Helpers for charmap-based codecs
675
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a new dictionary in which every element of the rng
        sequence is both key and value.

    """
    return dict([(item, item) for item in rng])
688
def make_encoding_map(decoding_map):

    """ Build the reverse (encoding) map for a decoding map.

        Every value of decoding_map becomes a key which maps back to
        the key it was stored under. A value that occurs more than
        once in the decoding map is mapped to None (undefined
        mapping) instead, causing an exception when encountered by
        the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # ambiguous reverse mapping: mark the target as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000709
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000710### error handlers
711
# Module-level shortcuts for the predefined error handlers: each name is
# bound to whatever lookup_error() returns for the scheme of that name.
strict_errors            = lookup_error("strict")
ignore_errors            = lookup_error("ignore")
replace_errors           = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors  = lookup_error("backslashreplace")
717
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000718# Tell modulefinder that using codecs probably needs the encodings
719# package
720_false = 0
721if _false:
722 import encodings
723
Guido van Rossum0612d842000-03-10 23:20:43 +0000724### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000725
if __name__ == '__main__':

    # Demo: wrap the standard streams in recoding wrappers.

    # Strings written to stdout are taken as Latin-1 and reach the
    # original stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data on the original stdin is read as Latin-1 and handed to the
    # caller as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')