blob: 4a4d043272096328b779ac2d5e12a0e232553d58 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Public names exported by "from codecs import *".
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    # byte order mark constants
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    "encode", "decode", "iterencode", "iterdecode",
    # error handling
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000032
Guido van Rossum0612d842000-03-10 23:20:43 +000033### Constants
34
35#
Walter Dörwald474458d2002-06-04 15:16:29 +000036# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000039#
Guido van Rossum0612d842000-03-10 23:20:43 +000040
# UTF-8 encoding of U+FEFF
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16 encodings of U+FEFF, little endian then big endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 encodings of U+FEFF, little endian then big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native-endianness aliases, selected from the interpreter's byte order
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# Plain BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000077
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The tuple itself holds (encode, decode, streamreader, streamwriter);
    every constructor argument is additionally exposed as an attribute
    of the same name.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    #
    # Class-level default: codecs are assumed to be text encodings unless
    # the constructor is told otherwise.
    _is_text_encoding = True

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Build the 4-tuple and attach the codec entry points as attributes."""
        members = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, members)
        info.name = name
        info.encode, info.decode = encode, decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter, info.streamreader = streamwriter, streamreader
        if _is_text_encoding is None:
            # Leave the class-level default visible through the instance.
            return info
        info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
Thomas Woutersa9773292006-04-21 09:43:23 +0000111
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; implementations return a tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            make encoding efficient should use StreamWriter.

            Zero length input must be handled: the encoder returns
            an empty object of the output object type in that case.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; implementations return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            make decoding efficient should use StreamReader.

            Zero length input must be handled: the decoder returns
            an empty object of the output object type in that case.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
176
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.

    Input is handed piece by piece to encode(); the encoder remembers the
    state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors names the error handling scheme to use and is stored on the
        instance unchanged. The predefined scheme names are listed in the
        Codec class docstring.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (a no-op in this base class).
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder (always 0 in this base class).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate(). This base version ignores it.
        """
        return None
216
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input: whatever _buffer_encode() leaves unconsumed is
    kept in self.buffer and prepended to the input of the next encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        outcome = self._buffer_encode(pending, self.errors, final)
        (encoded, used) = outcome
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # Any falsy state (such as 0) restores the empty buffer.
        self.buffer = state if state else ""
250
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.

    Input is handed piece by piece to decode(); the decoder remembers the
    state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors names the error handling scheme to use and is stored on the
        instance unchanged. The predefined scheme names are listed in the
        Codec class docstring.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (a no-op in this base class).
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that were
        passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is what this base version
        always returns.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset(). This base
        version ignores state.
        """
        return None
299
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must cope with
    incomplete byte sequences: whatever _buffer_decode() leaves unconsumed
    is kept in self.buffer and prepended to the input of the next decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        outcome = self._buffer_decode(pending, self.errors, final)
        (decoded, used) = outcome
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The buffer is the whole state; additional state info is always 0.
        leftover = self.buffer
        return (leftover, 0)

    def setstate(self, state):
        # Only the buffered input is restored; additional state info
        # is ignored.
        buffered_input = state[0]
        self.buffer = buffered_input
335
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000342
class StreamWriter(Codec):

    """ Codec that encodes objects and writes the result to a stream.

        Attributes not defined here are looked up on the wrapped
        stream, and the instance can be used as a context manager
        which closes the stream on exit.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed to self.encode() together with
            self.errors; only the encoded output is written, the
            consumed length is discarded.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            The codec is reset only when seeking to the absolute
            start of the stream (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream itself has not been
            set on this instance.
        """
        if name == 'stream':
            # Reaching this point means self.stream is missing, e.g. on an
            # instance created through __new__ without __init__ (as copy
            # and pickle do). Looking it up again below would re-enter
            # __getattr__ and recurse until RecursionError.
            raise AttributeError("%r object has no attribute %r"
                                 % (type(self).__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block always closes the wrapped stream.
        self.stream.close()
412
Guido van Rossum0612d842000-03-10 23:20:43 +0000413###
414
415class StreamReader(Codec):
416
Georg Brandl02524622010-12-02 18:06:51 +0000417 charbuffertype = str
418
Tim Peters30324a72001-05-15 17:19:16 +0000419 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000420
421 """ Creates a StreamReader instance.
422
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000423 stream must be a file-like object open for reading.
Guido van Rossum0612d842000-03-10 23:20:43 +0000424
Walter Dörwald7f82f792002-11-19 21:42:53 +0000425 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000426 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000427 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000428
429 'strict' - raise a ValueError (or a subclass)
430 'ignore' - ignore the character and continue with the next
431 'replace'- replace with a suitable replacement character;
432
Walter Dörwald7f82f792002-11-19 21:42:53 +0000433 The set of allowed parameter values can be extended via
434 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000435 """
436 self.stream = stream
437 self.errors = errors
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000438 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000439 self._empty_charbuffer = self.charbuffertype()
440 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000441 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000442
Walter Dörwald69652032004-09-07 20:24:22 +0000443 def decode(self, input, errors='strict'):
444 raise NotImplementedError
445
Martin v. Löwis56066d22005-08-24 07:38:12 +0000446 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000447
448 """ Decodes data from the stream self.stream and returns the
449 resulting object.
450
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000451 chars indicates the number of decoded code points or bytes to
452 return. read() will never return more data than requested,
453 but it might return less, if there is not enough available.
Walter Dörwald69652032004-09-07 20:24:22 +0000454
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000455 size indicates the approximate maximum number of decoded
456 bytes or code points to read for decoding. The decoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000457 can modify this setting as appropriate. The default value
458 -1 indicates to read and decode as much as possible. size
459 is intended to prevent having to decode huge files in one
460 step.
461
Martin v. Löwis56066d22005-08-24 07:38:12 +0000462 If firstline is true, and a UnicodeDecodeError happens
463 after the first line terminator in the input only the first line
464 will be returned, the rest of the input will be kept until the
465 next call to read().
466
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000467 The method should use a greedy read strategy, meaning that
Guido van Rossum0612d842000-03-10 23:20:43 +0000468 it should read as much data as is allowed within the
469 definition of the encoding and the given size, e.g. if
470 optional encoding endings or state markers are available
471 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000472 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000473 # If we have lines cached, first merge them back into characters
474 if self.linebuffer:
Georg Brandl02524622010-12-02 18:06:51 +0000475 self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000476 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000477
Walter Dörwald69652032004-09-07 20:24:22 +0000478 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000479 while True:
Tim Golden621302c2012-10-01 16:40:40 +0100480 # can the request be satisfied from the character buffer?
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200481 if chars >= 0:
Walter Dörwald69652032004-09-07 20:24:22 +0000482 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000483 break
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200484 elif size >= 0:
485 if len(self.charbuffer) >= size:
486 break
Walter Dörwald69652032004-09-07 20:24:22 +0000487 # we need more data
488 if size < 0:
489 newdata = self.stream.read()
490 else:
491 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000492 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000493 data = self.bytebuffer + newdata
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200494 if not data:
495 break
Martin v. Löwis56066d22005-08-24 07:38:12 +0000496 try:
497 newchars, decodedbytes = self.decode(data, self.errors)
Guido van Rossumb940e112007-01-10 16:19:56 +0000498 except UnicodeDecodeError as exc:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000499 if firstline:
Walter Dörwald3abcb012007-04-16 22:10:50 +0000500 newchars, decodedbytes = \
501 self.decode(data[:exc.start], self.errors)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300502 lines = newchars.splitlines(keepends=True)
Martin v. Löwis56066d22005-08-24 07:38:12 +0000503 if len(lines)<=1:
504 raise
505 else:
506 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000507 # keep undecoded bytes until the next call
508 self.bytebuffer = data[decodedbytes:]
509 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000510 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000511 # there was no data available
512 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000513 break
514 if chars < 0:
515 # Return everything we've got
516 result = self.charbuffer
Georg Brandl02524622010-12-02 18:06:51 +0000517 self.charbuffer = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000518 else:
519 # Return the first chars characters
520 result = self.charbuffer[:chars]
521 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000522 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000523
Walter Dörwald69652032004-09-07 20:24:22 +0000524 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000525
526 """ Read one line from the input stream and return the
527 decoded data.
528
Walter Dörwald69652032004-09-07 20:24:22 +0000529 size, if given, is passed as size argument to the
530 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000531
Guido van Rossuma3277132000-04-11 15:37:43 +0000532 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000533 # If we have lines cached from an earlier read, return
534 # them unconditionally
535 if self.linebuffer:
536 line = self.linebuffer[0]
537 del self.linebuffer[0]
538 if len(self.linebuffer) == 1:
539 # revert to charbuffer mode; we might need more data
540 # next time
541 self.charbuffer = self.linebuffer[0]
542 self.linebuffer = None
543 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300544 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000545 return line
Tim Peters536cf992005-12-25 23:18:31 +0000546
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000547 readsize = size or 72
Georg Brandl02524622010-12-02 18:06:51 +0000548 line = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000549 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000550 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000551 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000552 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000553 # If we're at a "\r" read one extra character (which might
554 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000555 # temporarily exhausted we return the wrong line ending.
Georg Brandl02524622010-12-02 18:06:51 +0000556 if (isinstance(data, str) and data.endswith("\r")) or \
557 (isinstance(data, bytes) and data.endswith(b"\r")):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000558 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000559
Walter Dörwald69652032004-09-07 20:24:22 +0000560 line += data
Ezio Melottid8b509b2011-09-28 17:37:55 +0300561 lines = line.splitlines(keepends=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000562 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000563 if len(lines) > 1:
564 # More than one line result; the first line is a full line
565 # to return
566 line = lines[0]
567 del lines[0]
568 if len(lines) > 1:
569 # cache the remaining lines
570 lines[-1] += self.charbuffer
571 self.linebuffer = lines
572 self.charbuffer = None
573 else:
574 # only one remaining line, put it back into charbuffer
575 self.charbuffer = lines[0] + self.charbuffer
576 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300577 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000578 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000579 line0withend = lines[0]
Ezio Melottid8b509b2011-09-28 17:37:55 +0300580 line0withoutend = lines[0].splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000581 if line0withend != line0withoutend: # We really have a line end
582 # Put the rest back together and keep it until the next call
Georg Brandl02524622010-12-02 18:06:51 +0000583 self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
584 self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000585 if keepends:
586 line = line0withend
587 else:
588 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000589 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000590 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000591 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000592 if line and not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300593 line = line.splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000594 break
Georg Brandl02524622010-12-02 18:06:51 +0000595 if readsize < 8000:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000596 readsize *= 2
597 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000598
Walter Dörwald69652032004-09-07 20:24:22 +0000599 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000600
601 """ Read all lines available on the input stream
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000602 and return them as a list.
Guido van Rossuma3277132000-04-11 15:37:43 +0000603
604 Line breaks are implemented using the codec's decoder
605 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000606
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000607 sizehint, if given, is ignored since there is no efficient
608 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000609
610 """
Walter Dörwald69652032004-09-07 20:24:22 +0000611 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000612 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000613
614 def reset(self):
615
616 """ Resets the codec buffers used for keeping state.
617
618 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000619 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000620 from decoding errors.
621
622 """
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000623 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000624 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000625 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000626
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000627 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000628 """ Set the input stream's current position.
629
630 Resets the codec buffers used for keeping state.
631 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000632 self.stream.seek(offset, whence)
Victor Stinnera92ad7e2010-05-22 16:59:09 +0000633 self.reset()
Guido van Rossum0612d842000-03-10 23:20:43 +0000634
Georg Brandla18af4e2007-04-21 15:47:16 +0000635 def __next__(self):
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000636
637 """ Return the next decoded line from the input stream."""
638 line = self.readline()
639 if line:
640 return line
641 raise StopIteration
642
643 def __iter__(self):
644 return self
645
Tim Peters30324a72001-05-15 17:19:16 +0000646 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000647 getattr=getattr):
648
649 """ Inherit all other methods from the underlying stream.
650 """
Tim Peters30324a72001-05-15 17:19:16 +0000651 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000652
def __enter__(self):
    """ Enter a with-block; the object itself is the bound value."""
    return self
655
def __exit__(self, type, value, tb):
    """ Leave a with-block by closing the underlying stream.

        Nothing is returned, so a pending exception is not
        suppressed.
    """
    stream = self.stream
    stream.close()
658
Guido van Rossum0612d842000-03-10 23:20:43 +0000659###
660
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        Read calls are delegated to a reader object and write calls
        to a writer object; both are built on the same stream from
        the factories passed to the constructor, e.g. the ones
        returned by the codecs.lookup() function.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Delegate to the reader's read()."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Delegate to the reader's readline()."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Delegate to the reader's readlines()."""
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        return self

    def write(self, data):
        """ Delegate to the writer's write()."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Delegate to the writer's writelines()."""
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        """ Reset the reader first, then the writer."""
        for codec_side in (self.reader, self.writer):
            codec_side.reset()

    def seek(self, offset, whence=0):
        """ Reposition the stream and reset the reader.

            The writer is reset only for an absolute seek to
            position 0.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence != 0:
            return
        if offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        stream = self.stream
        stream.close()
745
Guido van Rossum0612d842000-03-10 23:20:43 +0000746###
747
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the result
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encode function.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader delivers, re-encode
            it and return it split into lines (line ends are kept).

            sizehint is accepted for interface compatibility but is
            ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and hand
            the result to the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, decode them with the
            frontend decode function and write the result.

            The chunks are joined with a separator of a matching
            kind: an empty str when the first chunk is a str, an
            empty bytes object otherwise (including for an empty
            sequence).  Joining with '' unconditionally raised
            TypeError for bytes input.
        """
        chunks = tuple(list)
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop stale state.

            Without this method a seek would be forwarded straight
            to the stream by __getattr__ and the reader would go on
            serving data buffered from the old position.  As in
            StreamReaderWriter.seek(), the writer is reset only for
            an absolute seek to position 0.

            Returns whatever the stream's seek() returns.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Closing is the only cleanup; exceptions are not suppressed.
        self.stream.close()
855
Guido van Rossum0612d842000-03-10 23:20:43 +0000856### Shortcuts
857
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Underlying encoded files are always opened in binary mode.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API
        and is passed through to it unchanged.  It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or building the wrapper fails, the
        file that was just opened is closed again before the exception
        propagates.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file handle, e.g. on an unknown encoding.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000899
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapped file is decoded according
        to the given data_encoding and then encoded to the underlying
        file using file_encoding. The intermediate data type
        will usually be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and then
        passed back to the caller encoded using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name (file_encoding after the default
        has been applied). The attributes can be used for
        introspection by Python programs.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000935
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000936### Helpers for codec lookup
937
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000947
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
957
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
971
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000985
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000995
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1005
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.
    After the input is exhausted the encoder is flushed with a final
    call, whose output is yielded as well if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    enc = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = enc.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # final=True lets the encoder emit whatever it still holds back
    tail = enc.encode("", True)
    if tail:
        yield tail
1023
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input chunks from the iterator using an
    IncrementalDecoder and yields every non-empty piece of output.
    After the input is exhausted the decoder is flushed with a final
    call, whose output is yielded as well if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    dec = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = dec.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # final=True forces the decoder to deal with any pending bytes
    tail = dec.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001041
Marc-André Lemburga866df82001-01-03 21:29:14 +00001042### Helpers for charmap-based codecs
1043
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001053
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps every target of the decoding map back to
        its key.  If a target mapping in the decoding map occurs
        multiple times, then that target is mapped to None
        (undefined mapping), causing an exception when encountered
        by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        # A target seen before is ambiguous, so it is voided.
        encoding_map[target] = None if target in encoding_map else code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001074
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001075### error handlers
1076
try:
    # Bind the standard error handlers to module-level names.
    (strict_errors, ignore_errors, replace_errors,
     xmlcharrefreplace_errors, backslashreplace_errors) = [
        lookup_error(_handler_name)
        for _handler_name in ("strict", "ignore", "replace",
                              "xmlcharrefreplace", "backslashreplace")]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package.  The guard is a variable rather than a literal so that
# the import statement stays in the compiled code although it is
# never executed.
_false = 0
if _false:
    import encodings
1096
Guido van Rossum0612d842000-03-10 23:20:43 +00001097### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001098
Guido van Rossum0612d842000-03-10 23:20:43 +00001099if __name__ == '__main__':
1100
Guido van Rossuma3277132000-04-11 15:37:43 +00001101 # Make stdout translate Latin-1 output into UTF-8 output
1102 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001103
Guido van Rossuma3277132000-04-11 15:37:43 +00001104 # Have stdin translate Latin-1 input into UTF-8 input
1105 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')