blob: 2d6291e61617205a4952a953dfaf3635c82efdd2 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Names bound by a star-import of this module.  The entries keep their
# original order; they are only regrouped by purpose for readability.
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks (including the old, misnamed aliases)
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # codec lookup shortcuts
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    # one-shot and iterator based helpers
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors",
    "backslashreplace_errors", "namereplace_errors",
    "register_error", "lookup_error",
]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000033
Guido van Rossum0612d842000-03-10 23:20:43 +000034### Constants
35
#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian and big endian (with their short aliases)
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16 / UTF-32 marks in the native endianness of the running machine
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000078
79
80### Codec base classes (defining the API)
81
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader, streamwriter)
    and additionally carries those four entries, the two incremental
    factories and the codec name as instance attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Build the tuple and attach all codec components as attributes.

        _is_text_encoding only overrides the class-level default when it
        is passed explicitly (i.e. is not None).
        """
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        if _is_text_encoding is not None:
            # shadow the class attribute on this instance only
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        """Return a description naming the class, encoding and object id."""
        klass = self.__class__
        details = (klass.__module__, klass.__qualname__, self.name, id(self))
        return "<%s.%s object for encoding %s at %#x>" % details
Thomas Woutersa9773292006-04-21 09:43:23 +0000112
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling to apply and defaults to
            'strict'.

            Implementations may not store state in the Codec instance;
            codecs that need to keep state between calls should build
            on the stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object exposing a readable buffer; byte
            strings and memory mapped files are examples of such
            objects.

            errors selects the error handling to apply and defaults to
            'strict'.

            Implementations may not store state in the Codec instance;
            codecs that need to keep state between calls should build
            on the stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
177
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input piece by piece.

    Each piece is handed to encode(); the encoder is expected to remember
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the module docstring for a list of possible values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        Must be overridden; the base class raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base class keeps no state to reset, so this does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder.

        The base class always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base class ignores state.
        """
        return None
217
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always encode
    everything they are given: input that _buffer_encode() did not consume
    is kept in self.buffer and prepended to the input of the next call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over to the next encode() call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return the tuple
        # (output, length consumed)
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever the previous call left over
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # everything past the consumed prefix waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # a false state (such as the 0 from getstate()) means "no leftover"
        self.buffer = state if state else ""
251
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input piece by piece.

    Each piece is handed to decode(); the decoder is expected to remember
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; the base class raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base class keeps no state to reset, so this does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).

        The base class always reports that initial state.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base class ignores state.
        """
        return None
300
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that must cope with
    incomplete byte sequences: bytes that _buffer_decode() did not consume
    are kept in self.buffer and prepended to the input of the next call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes not yet decoded, carried over to the next decode() call
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return the tuple
        # (output, length consumed)
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever the previous call left over
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # everything past the consumed prefix waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the second item (additional state info) is always 0 here
        additional_state = 0
        return (self.buffer, additional_state)

    def setstate(self, state):
        # only the buffered input is restored; additional state info
        # (state[1]) is ignored
        buffered_input = state[0]
        self.buffer = buffered_input
336
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000343
class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to an
        underlying stream.

        Attributes that are not defined here are looked up on the
        wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using self.errors.
        """
        # the "length consumed" part of the result is not needed here
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base class keeps no such state, so this does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            The codec is only reset when seeking to the absolute
            position 0 (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the stream attribute itself is
            missing (e.g. on an instance created without __init__).
        """
        # __getattr__ only runs after normal lookup failed, so getting
        # here for 'stream' means the instance has no stream yet.  Reading
        # self.stream below would re-enter this method without end.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
413
Guido van Rossum0612d842000-03-10 23:20:43 +0000414###
415
416class StreamReader(Codec):
417
Georg Brandl02524622010-12-02 18:06:51 +0000418 charbuffertype = str
419
Tim Peters30324a72001-05-15 17:19:16 +0000420 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000421
422 """ Creates a StreamReader instance.
423
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000424 stream must be a file-like object open for reading.
Guido van Rossum0612d842000-03-10 23:20:43 +0000425
Walter Dörwald7f82f792002-11-19 21:42:53 +0000426 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000427 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000428 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000429
430 'strict' - raise a ValueError (or a subclass)
431 'ignore' - ignore the character and continue with the next
432 'replace'- replace with a suitable replacement character;
433
Walter Dörwald7f82f792002-11-19 21:42:53 +0000434 The set of allowed parameter values can be extended via
435 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000436 """
437 self.stream = stream
438 self.errors = errors
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000439 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000440 self._empty_charbuffer = self.charbuffertype()
441 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000442 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000443
Walter Dörwald69652032004-09-07 20:24:22 +0000444 def decode(self, input, errors='strict'):
445 raise NotImplementedError
446
Martin v. Löwis56066d22005-08-24 07:38:12 +0000447 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000448
449 """ Decodes data from the stream self.stream and returns the
450 resulting object.
451
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000452 chars indicates the number of decoded code points or bytes to
453 return. read() will never return more data than requested,
454 but it might return less, if there is not enough available.
Walter Dörwald69652032004-09-07 20:24:22 +0000455
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000456 size indicates the approximate maximum number of decoded
457 bytes or code points to read for decoding. The decoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000458 can modify this setting as appropriate. The default value
459 -1 indicates to read and decode as much as possible. size
460 is intended to prevent having to decode huge files in one
461 step.
462
Martin v. Löwis56066d22005-08-24 07:38:12 +0000463 If firstline is true, and a UnicodeDecodeError happens
464 after the first line terminator in the input only the first line
465 will be returned, the rest of the input will be kept until the
466 next call to read().
467
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000468 The method should use a greedy read strategy, meaning that
Guido van Rossum0612d842000-03-10 23:20:43 +0000469 it should read as much data as is allowed within the
470 definition of the encoding and the given size, e.g. if
471 optional encoding endings or state markers are available
472 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000473 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000474 # If we have lines cached, first merge them back into characters
475 if self.linebuffer:
Georg Brandl02524622010-12-02 18:06:51 +0000476 self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000477 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000478
Walter Dörwald69652032004-09-07 20:24:22 +0000479 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000480 while True:
Tim Golden621302c2012-10-01 16:40:40 +0100481 # can the request be satisfied from the character buffer?
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200482 if chars >= 0:
Walter Dörwald69652032004-09-07 20:24:22 +0000483 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000484 break
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200485 elif size >= 0:
486 if len(self.charbuffer) >= size:
487 break
Walter Dörwald69652032004-09-07 20:24:22 +0000488 # we need more data
489 if size < 0:
490 newdata = self.stream.read()
491 else:
492 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000493 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000494 data = self.bytebuffer + newdata
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200495 if not data:
496 break
Martin v. Löwis56066d22005-08-24 07:38:12 +0000497 try:
498 newchars, decodedbytes = self.decode(data, self.errors)
Guido van Rossumb940e112007-01-10 16:19:56 +0000499 except UnicodeDecodeError as exc:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000500 if firstline:
Walter Dörwald3abcb012007-04-16 22:10:50 +0000501 newchars, decodedbytes = \
502 self.decode(data[:exc.start], self.errors)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300503 lines = newchars.splitlines(keepends=True)
Martin v. Löwis56066d22005-08-24 07:38:12 +0000504 if len(lines)<=1:
505 raise
506 else:
507 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000508 # keep undecoded bytes until the next call
509 self.bytebuffer = data[decodedbytes:]
510 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000511 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000512 # there was no data available
513 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000514 break
515 if chars < 0:
516 # Return everything we've got
517 result = self.charbuffer
Georg Brandl02524622010-12-02 18:06:51 +0000518 self.charbuffer = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000519 else:
520 # Return the first chars characters
521 result = self.charbuffer[:chars]
522 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000523 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000524
Walter Dörwald69652032004-09-07 20:24:22 +0000525 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000526
527 """ Read one line from the input stream and return the
528 decoded data.
529
Walter Dörwald69652032004-09-07 20:24:22 +0000530 size, if given, is passed as size argument to the
531 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000532
Guido van Rossuma3277132000-04-11 15:37:43 +0000533 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000534 # If we have lines cached from an earlier read, return
535 # them unconditionally
536 if self.linebuffer:
537 line = self.linebuffer[0]
538 del self.linebuffer[0]
539 if len(self.linebuffer) == 1:
540 # revert to charbuffer mode; we might need more data
541 # next time
542 self.charbuffer = self.linebuffer[0]
543 self.linebuffer = None
544 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300545 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000546 return line
Tim Peters536cf992005-12-25 23:18:31 +0000547
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000548 readsize = size or 72
Georg Brandl02524622010-12-02 18:06:51 +0000549 line = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000550 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000551 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000552 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000553 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000554 # If we're at a "\r" read one extra character (which might
555 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000556 # temporarily exhausted we return the wrong line ending.
Georg Brandl02524622010-12-02 18:06:51 +0000557 if (isinstance(data, str) and data.endswith("\r")) or \
558 (isinstance(data, bytes) and data.endswith(b"\r")):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000559 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000560
Walter Dörwald69652032004-09-07 20:24:22 +0000561 line += data
Ezio Melottid8b509b2011-09-28 17:37:55 +0300562 lines = line.splitlines(keepends=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000563 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000564 if len(lines) > 1:
565 # More than one line result; the first line is a full line
566 # to return
567 line = lines[0]
568 del lines[0]
569 if len(lines) > 1:
570 # cache the remaining lines
571 lines[-1] += self.charbuffer
572 self.linebuffer = lines
573 self.charbuffer = None
574 else:
575 # only one remaining line, put it back into charbuffer
576 self.charbuffer = lines[0] + self.charbuffer
577 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300578 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000579 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000580 line0withend = lines[0]
Ezio Melottid8b509b2011-09-28 17:37:55 +0300581 line0withoutend = lines[0].splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000582 if line0withend != line0withoutend: # We really have a line end
583 # Put the rest back together and keep it until the next call
Georg Brandl02524622010-12-02 18:06:51 +0000584 self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
585 self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000586 if keepends:
587 line = line0withend
588 else:
589 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000590 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000591 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000592 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000593 if line and not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300594 line = line.splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000595 break
Georg Brandl02524622010-12-02 18:06:51 +0000596 if readsize < 8000:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000597 readsize *= 2
598 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000599
Walter Dörwald69652032004-09-07 20:24:22 +0000600 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000601
602 """ Read all lines available on the input stream
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000603 and return them as a list.
Guido van Rossuma3277132000-04-11 15:37:43 +0000604
605 Line breaks are implemented using the codec's decoder
606 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000607
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000608 sizehint, if given, is ignored since there is no efficient
609 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000610
611 """
Walter Dörwald69652032004-09-07 20:24:22 +0000612 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000613 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000614
615 def reset(self):
616
617 """ Resets the codec buffers used for keeping state.
618
619 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000620 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000621 from decoding errors.
622
623 """
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000624 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000625 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000626 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000627
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000628 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000629 """ Set the input stream's current position.
630
631 Resets the codec buffers used for keeping state.
632 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000633 self.stream.seek(offset, whence)
Victor Stinnera92ad7e2010-05-22 16:59:09 +0000634 self.reset()
Guido van Rossum0612d842000-03-10 23:20:43 +0000635
Georg Brandla18af4e2007-04-21 15:47:16 +0000636 def __next__(self):
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000637
638 """ Return the next decoded line from the input stream."""
639 line = self.readline()
640 if line:
641 return line
642 raise StopIteration
643
644 def __iter__(self):
645 return self
646
Tim Peters30324a72001-05-15 17:19:16 +0000647 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000648 getattr=getattr):
649
650 """ Inherit all other methods from the underlying stream.
651 """
Tim Peters30324a72001-05-15 17:19:16 +0000652 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000653
def __enter__(self):
    """ Enter a with block; the object itself is the managed value. """
    return self
656
def __exit__(self, type, value, tb):
    """ Leave a with block by closing the underlying stream.

        The exception details are ignored and nothing is returned, so
        an exception raised inside the block keeps propagating.
    """
    self.stream.close()
659
Guido van Rossum0612d842000-03-10 23:20:43 +0000660###
661
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and writing.

        Reads are served by a Reader instance and writes by a Writer
        instance, both created over the same underlying stream.

        The factory functions returned by the codecs.lookup() function
        can be used to construct the instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.  Each is called as factory(stream, errors).

            errors is passed to both factories and kept on the
            instance as .errors.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the reader and return its result. """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Read one line through the reader and return it. """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the reader's readlines(sizehint) result. """
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        """ Return self; the object is its own iterator. """
        return self

    def write(self, data):

        """ Pass data to the writer and return its result. """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Pass the given lines to the writer and return its result. """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream.

            The reader is always reset afterwards.  The writer is only
            reset when seeking to the absolute position 0.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        rewinding = (whence == 0 and offset == 0)
        if rewinding:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        underlying = self.stream
        return getattr(underlying, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        """ Enter a with block; the object itself is the managed value. """
        return self

    def __exit__(self, type, value, tb):
        """ Leave a with block by closing the underlying stream. """
        self.stream.close()
746
Guido van Rossum0612d842000-03-10 23:20:43 +0000747###
748
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the reader and return the result re-encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line through the reader and return it re-encoded.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything that is left, re-encode it and return it
            as a list of lines with the line ends kept.

            sizehint is accepted but not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        """ Return self; the object is its own iterator. """
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and pass the
            result to the writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given chunks, decode them with the frontend decode
            function and pass the result to the writer.

            The chunks are joined with a separator of their own type:
            str when the first chunk is a str, bytes otherwise (which
            includes the empty case).  Joining with a str separator
            unconditionally raised TypeError for bytes chunks.
        """
        # The argument may be a one-shot iterable, so materialise it
        # before looking at the first item.  tuple() is used because the
        # parameter name shadows the list builtin.
        chunks = tuple(list)
        if chunks and isinstance(chunks[0], str):
            joiner = ''
        else:
            joiner = b''
        data = joiner.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the reader first, then the writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Enter a with block; the object itself is the managed value. """
        return self

    def __exit__(self, type, value, tb):
        """ Leave a with block by closing the underlying stream. """
        self.stream.close()
856
Guido van Rossum0612d842000-03-10 23:20:43 +0000857### Shortcuts
858
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the underlying file is always opened
        in binary mode: 'b' is appended to mode if it is missing.
        The default file mode is 'r', meaning to open the file in read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the file object returned by the builtin
        open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed unchanged to the builtin open() API.
        It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding.

        If the codec lookup or the construction of the wrapper fails,
        the file that was just opened is closed before the exception
        is re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Nobody else holds a reference to the file yet, so it has to be
        # closed here or it leaks until it is garbage collected.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000900
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that the encoding is translated transparently.

        Data written to the wrapper is decoded according to
        data_encoding and then written to the underlying file encoded
        with file_encoding.  The intermediate data type will usually
        be Unicode but depends on the specified codecs.

        Bytes read from the file are decoded using file_encoding and
        handed back to the caller encoded using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the encoding names
        that were used.  Programs can use them for introspection.

    """
    file_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode, frontend.decode,
        backend.streamreader, backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000936
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000937### Helpers for codec lookup
938
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000948
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
958
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
972
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000986
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000996
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1006
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty result.  After the
    iterator is exhausted the encoder is called one last time with
    final=True, and that output is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # Final call: lets the encoder emit whatever it still holds.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1024
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an
    IncrementalDecoder and yields every non-empty result.  After the
    iterator is exhausted the decoder is called one last time with
    final=True, and that output is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # Final call: lets the decoder deal with input it is still holding.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001042
Marc-André Lemburga866df82001-01-03 21:29:14 +00001043### Helpers for charmap-based codecs
1044
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of rng is both a
        key and the value stored under that key.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001054
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Keys and values of the decoding map are swapped.  A value
        that is the target of more than one key is mapped to None
        (undefined mapping), causing an exception when encountered
        by the charmap codec during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001075
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001076### error handlers
1077
try:
    # Bind each standard error handler to a module-level name; these
    # names are listed in __all__.  lookup_error is presumably provided
    # by the star import from _codecs -- confirm.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
    namereplace_errors = lookup_error("namereplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing.
    # A single failed lookup sets all six names to None, including any
    # that were already bound above.
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
    namereplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001093
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed because _false is 0;
# it is only there to be seen by tools that scan the source.
_false = 0
if _false:
    import encodings
1099
Guido van Rossum0612d842000-03-10 23:20:43 +00001100### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001101
if __name__ == '__main__':

    # Runs only when the module is executed as a script: both standard
    # streams are replaced by recoding wrappers.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')