blob: 21c45a7d10a4c9adf3a864a2d7dab49551116602 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
Victor Stinner272d8882017-06-16 08:59:01 +02008"""
Guido van Rossum0612d842000-03-10 23:20:43 +00009
Victor Stinner272d8882017-06-16 08:59:01 +020010import builtins
11import sys
Guido van Rossum0612d842000-03-10 23:20:43 +000012
13### Registry and builtin stateless codec functions
14
Guido van Rossumb95de4f2000-03-31 17:25:23 +000015try:
16 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000017except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000018 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000019
Tim Peters30324a72001-05-15 17:19:16 +000020__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000021 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000023 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
Serhiy Storchakade3ee5b2014-12-20 17:42:38 +020024 "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
25 "StreamReader", "StreamWriter",
26 "StreamReaderWriter", "StreamRecoder",
27 "getencoder", "getdecoder", "getincrementalencoder",
28 "getincrementaldecoder", "getreader", "getwriter",
29 "encode", "decode", "iterencode", "iterdecode",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000030 "strict_errors", "ignore_errors", "replace_errors",
31 "xmlcharrefreplace_errors",
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020032 "backslashreplace_errors", "namereplace_errors",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000033 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000034
Guido van Rossum0612d842000-03-10 23:20:43 +000035### Constants
36
37#
Walter Dörwald474458d2002-06-04 15:16:29 +000038# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
39# and its possible byte string values
40# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000041#
Guido van Rossum0612d842000-03-10 23:20:43 +000042
Walter Dörwald474458d2002-06-04 15:16:29 +000043# UTF-8
Walter Dörwaldca8a8d02007-05-04 13:05:09 +000044BOM_UTF8 = b'\xef\xbb\xbf'
Walter Dörwald474458d2002-06-04 15:16:29 +000045
46# UTF-16, little endian
Walter Dörwaldca8a8d02007-05-04 13:05:09 +000047BOM_LE = BOM_UTF16_LE = b'\xff\xfe'
Walter Dörwald474458d2002-06-04 15:16:29 +000048
49# UTF-16, big endian
Walter Dörwaldca8a8d02007-05-04 13:05:09 +000050BOM_BE = BOM_UTF16_BE = b'\xfe\xff'
Walter Dörwald474458d2002-06-04 15:16:29 +000051
52# UTF-32, little endian
Walter Dörwaldca8a8d02007-05-04 13:05:09 +000053BOM_UTF32_LE = b'\xff\xfe\x00\x00'
Walter Dörwald474458d2002-06-04 15:16:29 +000054
55# UTF-32, big endian
Walter Dörwaldca8a8d02007-05-04 13:05:09 +000056BOM_UTF32_BE = b'\x00\x00\xfe\xff'
Walter Dörwald474458d2002-06-04 15:16:29 +000057
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000058if sys.byteorder == 'little':
Walter Dörwald474458d2002-06-04 15:16:29 +000059
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000060 # UTF-16, native endianness
61 BOM = BOM_UTF16 = BOM_UTF16_LE
62
63 # UTF-32, native endianness
64 BOM_UTF32 = BOM_UTF32_LE
65
66else:
67
68 # UTF-16, native endianness
69 BOM = BOM_UTF16 = BOM_UTF16_BE
70
71 # UTF-32, native endianness
72 BOM_UTF32 = BOM_UTF32_BE
Walter Dörwald474458d2002-06-04 15:16:29 +000073
74# Old broken names (don't use in new code)
75BOM32_LE = BOM_UTF16_LE
76BOM32_BE = BOM_UTF16_BE
77BOM64_LE = BOM_UTF32_LE
78BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000079
80
81### Codec base classes (defining the API)
82
Thomas Woutersa9773292006-04-21 09:43:23 +000083class CodecInfo(tuple):
Nick Coghlanc72e4e62013-11-22 22:39:36 +100084 """Codec details when looking up the codec registry"""
85
86 # Private API to allow Python 3.4 to blacklist the known non-Unicode
87 # codecs in the standard library. A more general mechanism to
88 # reliably distinguish test encodings from other codecs will hopefully
89 # be defined for Python 3.5
90 #
91 # See http://bugs.python.org/issue19619
92 _is_text_encoding = True # Assume codecs are text encodings by default
Thomas Woutersa9773292006-04-21 09:43:23 +000093
94 def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
Nick Coghlanc72e4e62013-11-22 22:39:36 +100095 incrementalencoder=None, incrementaldecoder=None, name=None,
96 *, _is_text_encoding=None):
Thomas Woutersa9773292006-04-21 09:43:23 +000097 self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
98 self.name = name
99 self.encode = encode
100 self.decode = decode
101 self.incrementalencoder = incrementalencoder
102 self.incrementaldecoder = incrementaldecoder
103 self.streamwriter = streamwriter
104 self.streamreader = streamreader
Nick Coghlanc72e4e62013-11-22 22:39:36 +1000105 if _is_text_encoding is not None:
106 self._is_text_encoding = _is_text_encoding
Thomas Woutersa9773292006-04-21 09:43:23 +0000107 return self
108
109 def __repr__(self):
Serhiy Storchaka521e5862014-07-22 15:00:37 +0300110 return "<%s.%s object for encoding %s at %#x>" % \
111 (self.__class__.__module__, self.__class__.__qualname__,
Walter Dörwald3abcb012007-04-16 22:10:50 +0000112 self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +0000113
Guido van Rossum0612d842000-03-10 23:20:43 +0000114class Codec:
115
116 """ Defines the interface for stateless encoders/decoders.
117
Walter Dörwald7f82f792002-11-19 21:42:53 +0000118 The .encode()/.decode() methods may use different error
Guido van Rossum0612d842000-03-10 23:20:43 +0000119 handling schemes by providing the errors argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000120 string values are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000121
Guido van Rossumd8855fd2000-03-24 22:14:19 +0000122 'strict' - raise a ValueError error (or a subclass)
Guido van Rossum0612d842000-03-10 23:20:43 +0000123 'ignore' - ignore the character and continue with the next
124 'replace' - replace with a suitable replacement character;
125 Python will use the official U+FFFD REPLACEMENT
Walter Dörwald7f82f792002-11-19 21:42:53 +0000126 CHARACTER for the builtin Unicode codecs on
127 decoding and '?' on encoding.
Serhiy Storchakad3faf432015-01-18 11:28:37 +0200128 'surrogateescape' - replace with private code points U+DCnn.
Walter Dörwald7f82f792002-11-19 21:42:53 +0000129 'xmlcharrefreplace' - Replace with the appropriate XML
130 character reference (only for encoding).
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200131 'backslashreplace' - Replace with backslashed escape sequences.
132 'namereplace' - Replace with \\N{...} escape sequences
Walter Dörwald7f82f792002-11-19 21:42:53 +0000133 (only for encoding).
134
135 The set of allowed values can be extended via register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000136
137 """
Tim Peters30324a72001-05-15 17:19:16 +0000138 def encode(self, input, errors='strict'):
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000139
Fred Drake3e74c0d2000-03-17 15:40:35 +0000140 """ Encodes the object input and returns a tuple (output
Guido van Rossum0612d842000-03-10 23:20:43 +0000141 object, length consumed).
142
143 errors defines the error handling to apply. It defaults to
144 'strict' handling.
145
146 The method may not store state in the Codec instance. Use
Berker Peksag41ca8282015-07-30 18:26:10 +0300147 StreamWriter for codecs which have to keep state in order to
148 make encoding efficient.
Guido van Rossum0612d842000-03-10 23:20:43 +0000149
150 The encoder must be able to handle zero length input and
151 return an empty object of the output object type in this
152 situation.
153
154 """
155 raise NotImplementedError
156
Tim Peters30324a72001-05-15 17:19:16 +0000157 def decode(self, input, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000158
159 """ Decodes the object input and returns a tuple (output
160 object, length consumed).
161
162 input must be an object which provides the bf_getreadbuf
163 buffer slot. Python strings, buffer objects and memory
164 mapped files are examples of objects providing this slot.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000165
Guido van Rossum0612d842000-03-10 23:20:43 +0000166 errors defines the error handling to apply. It defaults to
167 'strict' handling.
168
169 The method may not store state in the Codec instance. Use
Berker Peksag41ca8282015-07-30 18:26:10 +0300170 StreamReader for codecs which have to keep state in order to
171 make decoding efficient.
Guido van Rossum0612d842000-03-10 23:20:43 +0000172
173 The decoder must be able to handle zero length input and
174 return an empty object of the output object type in this
175 situation.
176
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000177 """
Guido van Rossum0612d842000-03-10 23:20:43 +0000178 raise NotImplementedError
179
Thomas Woutersa9773292006-04-21 09:43:23 +0000180class IncrementalEncoder(object):
181 """
Walter Dörwald3abcb012007-04-16 22:10:50 +0000182 An IncrementalEncoder encodes an input in multiple steps. The input can
183 be passed piece by piece to the encode() method. The IncrementalEncoder
184 remembers the state of the encoding process between calls to encode().
Thomas Woutersa9773292006-04-21 09:43:23 +0000185 """
186 def __init__(self, errors='strict'):
187 """
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000188 Creates an IncrementalEncoder instance.
Thomas Woutersa9773292006-04-21 09:43:23 +0000189
190 The IncrementalEncoder may use different error handling schemes by
191 providing the errors keyword argument. See the module docstring
192 for a list of possible values.
193 """
194 self.errors = errors
195 self.buffer = ""
196
197 def encode(self, input, final=False):
198 """
199 Encodes input and returns the resulting object.
200 """
201 raise NotImplementedError
202
203 def reset(self):
204 """
205 Resets the encoder to the initial state.
206 """
207
Walter Dörwald3abcb012007-04-16 22:10:50 +0000208 def getstate(self):
209 """
210 Return the current state of the encoder.
211 """
212 return 0
213
214 def setstate(self, state):
215 """
216 Set the current state of the encoder. state must have been
217 returned by getstate().
218 """
219
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000220class BufferedIncrementalEncoder(IncrementalEncoder):
221 """
222 This subclass of IncrementalEncoder can be used as the baseclass for an
223 incremental encoder if the encoder must keep some of the output in a
224 buffer between calls to encode().
225 """
226 def __init__(self, errors='strict'):
227 IncrementalEncoder.__init__(self, errors)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000228 # unencoded input that is kept between calls to encode()
229 self.buffer = ""
Thomas Wouters49fd7fa2006-04-21 10:40:58 +0000230
231 def _buffer_encode(self, input, errors, final):
232 # Overwrite this method in subclasses: It must encode input
233 # and return an (output, length consumed) tuple
234 raise NotImplementedError
235
236 def encode(self, input, final=False):
237 # encode input (taking the buffer into account)
238 data = self.buffer + input
239 (result, consumed) = self._buffer_encode(data, self.errors, final)
240 # keep unencoded input until the next call
241 self.buffer = data[consumed:]
242 return result
243
244 def reset(self):
245 IncrementalEncoder.reset(self)
246 self.buffer = ""
247
Walter Dörwald3abcb012007-04-16 22:10:50 +0000248 def getstate(self):
249 return self.buffer or 0
250
251 def setstate(self, state):
252 self.buffer = state or ""
253
Thomas Woutersa9773292006-04-21 09:43:23 +0000254class IncrementalDecoder(object):
255 """
Walter Dörwald3abcb012007-04-16 22:10:50 +0000256 An IncrementalDecoder decodes an input in multiple steps. The input can
257 be passed piece by piece to the decode() method. The IncrementalDecoder
Thomas Woutersa9773292006-04-21 09:43:23 +0000258 remembers the state of the decoding process between calls to decode().
259 """
260 def __init__(self, errors='strict'):
261 """
Martin Panter7462b6492015-11-02 03:37:02 +0000262 Create an IncrementalDecoder instance.
Thomas Woutersa9773292006-04-21 09:43:23 +0000263
264 The IncrementalDecoder may use different error handling schemes by
265 providing the errors keyword argument. See the module docstring
266 for a list of possible values.
267 """
268 self.errors = errors
269
270 def decode(self, input, final=False):
271 """
Ka-Ping Yeef44c7e82008-03-18 04:51:32 +0000272 Decode input and returns the resulting object.
Thomas Woutersa9773292006-04-21 09:43:23 +0000273 """
274 raise NotImplementedError
275
276 def reset(self):
277 """
Ka-Ping Yeef44c7e82008-03-18 04:51:32 +0000278 Reset the decoder to the initial state.
Thomas Woutersa9773292006-04-21 09:43:23 +0000279 """
280
Walter Dörwald3abcb012007-04-16 22:10:50 +0000281 def getstate(self):
282 """
Ka-Ping Yeef44c7e82008-03-18 04:51:32 +0000283 Return the current state of the decoder.
284
285 This must be a (buffered_input, additional_state_info) tuple.
286 buffered_input must be a bytes object containing bytes that
287 were passed to decode() that have not yet been converted.
288 additional_state_info must be a non-negative integer
289 representing the state of the decoder WITHOUT yet having
290 processed the contents of buffered_input. In the initial state
291 and after reset(), getstate() must return (b"", 0).
Walter Dörwald3abcb012007-04-16 22:10:50 +0000292 """
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000293 return (b"", 0)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000294
295 def setstate(self, state):
296 """
Ka-Ping Yeef44c7e82008-03-18 04:51:32 +0000297 Set the current state of the decoder.
298
299 state must have been returned by getstate(). The effect of
300 setstate((b"", 0)) must be equivalent to reset().
Walter Dörwald3abcb012007-04-16 22:10:50 +0000301 """
302
Thomas Woutersa9773292006-04-21 09:43:23 +0000303class BufferedIncrementalDecoder(IncrementalDecoder):
304 """
305 This subclass of IncrementalDecoder can be used as the baseclass for an
Walter Dörwald3abcb012007-04-16 22:10:50 +0000306 incremental decoder if the decoder must be able to handle incomplete
307 byte sequences.
Thomas Woutersa9773292006-04-21 09:43:23 +0000308 """
309 def __init__(self, errors='strict'):
310 IncrementalDecoder.__init__(self, errors)
Walter Dörwald3abcb012007-04-16 22:10:50 +0000311 # undecoded input that is kept between calls to decode()
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000312 self.buffer = b""
Thomas Woutersa9773292006-04-21 09:43:23 +0000313
314 def _buffer_decode(self, input, errors, final):
315 # Overwrite this method in subclasses: It must decode input
316 # and return an (output, length consumed) tuple
317 raise NotImplementedError
318
319 def decode(self, input, final=False):
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
325 return result
326
327 def reset(self):
328 IncrementalDecoder.reset(self)
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000329 self.buffer = b""
Thomas Woutersa9773292006-04-21 09:43:23 +0000330
Walter Dörwald3abcb012007-04-16 22:10:50 +0000331 def getstate(self):
332 # additional state info is always 0
333 return (self.buffer, 0)
334
335 def setstate(self, state):
336 # ignore additional state info
337 self.buffer = state[0]
338
Guido van Rossum0612d842000-03-10 23:20:43 +0000339#
340# The StreamWriter and StreamReader class provide generic working
Andrew M. Kuchling97c56352001-09-18 20:29:48 +0000341# interfaces which can be used to implement new encoding submodules
Guido van Rossum0612d842000-03-10 23:20:43 +0000342# very easily. See encodings/utf_8.py for an example on how this is
343# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000344#
Guido van Rossum0612d842000-03-10 23:20:43 +0000345
346class StreamWriter(Codec):
347
Tim Peters30324a72001-05-15 17:19:16 +0000348 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000349
350 """ Creates a StreamWriter instance.
351
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000352 stream must be a file-like object open for writing.
Guido van Rossum0612d842000-03-10 23:20:43 +0000353
Walter Dörwald7f82f792002-11-19 21:42:53 +0000354 The StreamWriter may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000355 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000356 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000357
358 'strict' - raise a ValueError (or a subclass)
359 'ignore' - ignore the character and continue with the next
360 'replace'- replace with a suitable replacement character
Walter Dörwald7f82f792002-11-19 21:42:53 +0000361 'xmlcharrefreplace' - Replace with the appropriate XML
362 character reference.
363 'backslashreplace' - Replace with backslashed escape
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200364 sequences.
365 'namereplace' - Replace with \\N{...} escape sequences.
Guido van Rossum0612d842000-03-10 23:20:43 +0000366
Walter Dörwald7f82f792002-11-19 21:42:53 +0000367 The set of allowed parameter values can be extended via
368 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000369 """
370 self.stream = stream
371 self.errors = errors
372
Guido van Rossuma3277132000-04-11 15:37:43 +0000373 def write(self, object):
Guido van Rossum0612d842000-03-10 23:20:43 +0000374
375 """ Writes the object's contents encoded to self.stream.
376 """
Tim Peters30324a72001-05-15 17:19:16 +0000377 data, consumed = self.encode(object, self.errors)
Guido van Rossum0612d842000-03-10 23:20:43 +0000378 self.stream.write(data)
379
Guido van Rossuma3277132000-04-11 15:37:43 +0000380 def writelines(self, list):
381
382 """ Writes the concatenated list of strings to the stream
383 using .write().
384 """
385 self.write(''.join(list))
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000386
Guido van Rossum0612d842000-03-10 23:20:43 +0000387 def reset(self):
388
389 """ Flushes and resets the codec buffers used for keeping state.
390
391 Calling this method should ensure that the data on the
392 output is put into a clean state, that allows appending
393 of new fresh data without having to rescan the whole
394 stream to recover state.
395
396 """
397 pass
398
Victor Stinnera92ad7e2010-05-22 16:59:09 +0000399 def seek(self, offset, whence=0):
400 self.stream.seek(offset, whence)
401 if whence == 0 and offset == 0:
402 self.reset()
403
Tim Peters30324a72001-05-15 17:19:16 +0000404 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000405 getattr=getattr):
406
407 """ Inherit all other methods from the underlying stream.
408 """
Tim Peters30324a72001-05-15 17:19:16 +0000409 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000410
Thomas Wouters89f507f2006-12-13 04:49:30 +0000411 def __enter__(self):
412 return self
413
414 def __exit__(self, type, value, tb):
415 self.stream.close()
416
Guido van Rossum0612d842000-03-10 23:20:43 +0000417###
418
419class StreamReader(Codec):
420
Georg Brandl02524622010-12-02 18:06:51 +0000421 charbuffertype = str
422
Tim Peters30324a72001-05-15 17:19:16 +0000423 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000424
425 """ Creates a StreamReader instance.
426
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000427 stream must be a file-like object open for reading.
Guido van Rossum0612d842000-03-10 23:20:43 +0000428
Walter Dörwald7f82f792002-11-19 21:42:53 +0000429 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000430 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000431 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000432
433 'strict' - raise a ValueError (or a subclass)
434 'ignore' - ignore the character and continue with the next
Serhiy Storchaka07985ef2015-01-25 22:56:57 +0200435 'replace'- replace with a suitable replacement character
436 'backslashreplace' - Replace with backslashed escape sequences;
Guido van Rossum0612d842000-03-10 23:20:43 +0000437
Walter Dörwald7f82f792002-11-19 21:42:53 +0000438 The set of allowed parameter values can be extended via
439 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000440 """
441 self.stream = stream
442 self.errors = errors
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000443 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000444 self._empty_charbuffer = self.charbuffertype()
445 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000446 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000447
Walter Dörwald69652032004-09-07 20:24:22 +0000448 def decode(self, input, errors='strict'):
449 raise NotImplementedError
450
Martin v. Löwis56066d22005-08-24 07:38:12 +0000451 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000452
453 """ Decodes data from the stream self.stream and returns the
454 resulting object.
455
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000456 chars indicates the number of decoded code points or bytes to
457 return. read() will never return more data than requested,
458 but it might return less, if there is not enough available.
Walter Dörwald69652032004-09-07 20:24:22 +0000459
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000460 size indicates the approximate maximum number of decoded
461 bytes or code points to read for decoding. The decoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000462 can modify this setting as appropriate. The default value
463 -1 indicates to read and decode as much as possible. size
464 is intended to prevent having to decode huge files in one
465 step.
466
Martin v. Löwis56066d22005-08-24 07:38:12 +0000467 If firstline is true, and a UnicodeDecodeError happens
468 after the first line terminator in the input only the first line
469 will be returned, the rest of the input will be kept until the
470 next call to read().
471
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000472 The method should use a greedy read strategy, meaning that
Guido van Rossum0612d842000-03-10 23:20:43 +0000473 it should read as much data as is allowed within the
474 definition of the encoding and the given size, e.g. if
475 optional encoding endings or state markers are available
476 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000477 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000478 # If we have lines cached, first merge them back into characters
479 if self.linebuffer:
Georg Brandl02524622010-12-02 18:06:51 +0000480 self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000481 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000482
Serhiy Storchaka219c2de2017-11-29 01:30:00 +0200483 if chars < 0:
484 # For compatibility with other read() methods that take a
485 # single argument
486 chars = size
487
Walter Dörwald69652032004-09-07 20:24:22 +0000488 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000489 while True:
Tim Golden621302c2012-10-01 16:40:40 +0100490 # can the request be satisfied from the character buffer?
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200491 if chars >= 0:
Walter Dörwald69652032004-09-07 20:24:22 +0000492 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000493 break
494 # we need more data
495 if size < 0:
496 newdata = self.stream.read()
497 else:
498 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000499 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000500 data = self.bytebuffer + newdata
Serhiy Storchakadbe09822014-01-26 19:27:56 +0200501 if not data:
502 break
Martin v. Löwis56066d22005-08-24 07:38:12 +0000503 try:
504 newchars, decodedbytes = self.decode(data, self.errors)
Guido van Rossumb940e112007-01-10 16:19:56 +0000505 except UnicodeDecodeError as exc:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000506 if firstline:
Walter Dörwald3abcb012007-04-16 22:10:50 +0000507 newchars, decodedbytes = \
508 self.decode(data[:exc.start], self.errors)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300509 lines = newchars.splitlines(keepends=True)
Martin v. Löwis56066d22005-08-24 07:38:12 +0000510 if len(lines)<=1:
511 raise
512 else:
513 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000514 # keep undecoded bytes until the next call
515 self.bytebuffer = data[decodedbytes:]
516 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000517 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000518 # there was no data available
519 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000520 break
521 if chars < 0:
522 # Return everything we've got
523 result = self.charbuffer
Georg Brandl02524622010-12-02 18:06:51 +0000524 self.charbuffer = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000525 else:
526 # Return the first chars characters
527 result = self.charbuffer[:chars]
528 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000529 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000530
Walter Dörwald69652032004-09-07 20:24:22 +0000531 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000532
533 """ Read one line from the input stream and return the
534 decoded data.
535
Walter Dörwald69652032004-09-07 20:24:22 +0000536 size, if given, is passed as size argument to the
537 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000538
Guido van Rossuma3277132000-04-11 15:37:43 +0000539 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000540 # If we have lines cached from an earlier read, return
541 # them unconditionally
542 if self.linebuffer:
543 line = self.linebuffer[0]
544 del self.linebuffer[0]
545 if len(self.linebuffer) == 1:
546 # revert to charbuffer mode; we might need more data
547 # next time
548 self.charbuffer = self.linebuffer[0]
549 self.linebuffer = None
550 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300551 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000552 return line
Tim Peters536cf992005-12-25 23:18:31 +0000553
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000554 readsize = size or 72
Georg Brandl02524622010-12-02 18:06:51 +0000555 line = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000556 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000557 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000558 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000559 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000560 # If we're at a "\r" read one extra character (which might
561 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000562 # temporarily exhausted we return the wrong line ending.
Georg Brandl02524622010-12-02 18:06:51 +0000563 if (isinstance(data, str) and data.endswith("\r")) or \
564 (isinstance(data, bytes) and data.endswith(b"\r")):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000565 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000566
Walter Dörwald69652032004-09-07 20:24:22 +0000567 line += data
Ezio Melottid8b509b2011-09-28 17:37:55 +0300568 lines = line.splitlines(keepends=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000569 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000570 if len(lines) > 1:
571 # More than one line result; the first line is a full line
572 # to return
573 line = lines[0]
574 del lines[0]
575 if len(lines) > 1:
576 # cache the remaining lines
577 lines[-1] += self.charbuffer
578 self.linebuffer = lines
579 self.charbuffer = None
580 else:
581 # only one remaining line, put it back into charbuffer
582 self.charbuffer = lines[0] + self.charbuffer
583 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300584 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000585 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000586 line0withend = lines[0]
Ezio Melottid8b509b2011-09-28 17:37:55 +0300587 line0withoutend = lines[0].splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000588 if line0withend != line0withoutend: # We really have a line end
589 # Put the rest back together and keep it until the next call
Georg Brandl02524622010-12-02 18:06:51 +0000590 self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
591 self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000592 if keepends:
593 line = line0withend
594 else:
595 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000596 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000597 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000598 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000599 if line and not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300600 line = line.splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000601 break
Georg Brandl02524622010-12-02 18:06:51 +0000602 if readsize < 8000:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000603 readsize *= 2
604 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000605
Walter Dörwald69652032004-09-07 20:24:22 +0000606 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000607
608 """ Read all lines available on the input stream
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000609 and return them as a list.
Guido van Rossuma3277132000-04-11 15:37:43 +0000610
611 Line breaks are implemented using the codec's decoder
612 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000613
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000614 sizehint, if given, is ignored since there is no efficient
615 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000616
617 """
Walter Dörwald69652032004-09-07 20:24:22 +0000618 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000619 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000620
621 def reset(self):
622
623 """ Resets the codec buffers used for keeping state.
624
625 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000626 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000627 from decoding errors.
628
629 """
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000630 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000631 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000632 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000633
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000634 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000635 """ Set the input stream's current position.
636
637 Resets the codec buffers used for keeping state.
638 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000639 self.stream.seek(offset, whence)
Victor Stinnera92ad7e2010-05-22 16:59:09 +0000640 self.reset()
Guido van Rossum0612d842000-03-10 23:20:43 +0000641
Georg Brandla18af4e2007-04-21 15:47:16 +0000642 def __next__(self):
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000643
644 """ Return the next decoded line from the input stream."""
645 line = self.readline()
646 if line:
647 return line
648 raise StopIteration
649
650 def __iter__(self):
651 return self
652
Tim Peters30324a72001-05-15 17:19:16 +0000653 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000654 getattr=getattr):
655
656 """ Inherit all other methods from the underlying stream.
657 """
Tim Peters30324a72001-05-15 17:19:16 +0000658 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000659
Thomas Wouters89f507f2006-12-13 04:49:30 +0000660 def __enter__(self):
661 return self
662
663 def __exit__(self, type, value, tb):
664 self.stream.close()
665
Guido van Rossum0612d842000-03-10 23:20:43 +0000666###
667
668class StreamReaderWriter:
669
Fred Drake49fd1072000-04-13 14:11:21 +0000670 """ StreamReaderWriter instances allow wrapping streams which
671 work in both read and write modes.
672
673 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000674 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000675 instance.
676
677 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000678 # Optional attributes set by the file wrappers below
679 encoding = 'unknown'
680
Tim Peters30324a72001-05-15 17:19:16 +0000681 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000682
683 """ Creates a StreamReaderWriter instance.
684
685 stream must be a Stream-like object.
686
687 Reader, Writer must be factory functions or classes
688 providing the StreamReader, StreamWriter interface resp.
689
690 Error handling is done in the same way as defined for the
691 StreamWriter/Readers.
692
693 """
694 self.stream = stream
695 self.reader = Reader(stream, errors)
696 self.writer = Writer(stream, errors)
697 self.errors = errors
698
Tim Peters30324a72001-05-15 17:19:16 +0000699 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000700
701 return self.reader.read(size)
702
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000703 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000704
705 return self.reader.readline(size)
706
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000707 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000708
709 return self.reader.readlines(sizehint)
710
Georg Brandla18af4e2007-04-21 15:47:16 +0000711 def __next__(self):
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000712
713 """ Return the next decoded line from the input stream."""
Georg Brandla18af4e2007-04-21 15:47:16 +0000714 return next(self.reader)
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000715
716 def __iter__(self):
717 return self
718
Tim Peters30324a72001-05-15 17:19:16 +0000719 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000720
721 return self.writer.write(data)
722
Tim Peters30324a72001-05-15 17:19:16 +0000723 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000724
725 return self.writer.writelines(list)
726
Guido van Rossum0612d842000-03-10 23:20:43 +0000727 def reset(self):
728
729 self.reader.reset()
730 self.writer.reset()
731
Victor Stinner3fed0872010-05-22 02:16:27 +0000732 def seek(self, offset, whence=0):
Victor Stinnera92ad7e2010-05-22 16:59:09 +0000733 self.stream.seek(offset, whence)
734 self.reader.reset()
735 if whence == 0 and offset == 0:
736 self.writer.reset()
Victor Stinner3fed0872010-05-22 02:16:27 +0000737
Tim Peters30324a72001-05-15 17:19:16 +0000738 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000739 getattr=getattr):
740
741 """ Inherit all other methods from the underlying stream.
742 """
Tim Peters30324a72001-05-15 17:19:16 +0000743 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000744
Victor Stinner272d8882017-06-16 08:59:01 +0200745 # these are needed to make "with StreamReaderWriter(...)" work properly
Thomas Wouters89f507f2006-12-13 04:49:30 +0000746
747 def __enter__(self):
748 return self
749
750 def __exit__(self, type, value, tb):
751 self.stream.close()
752
Guido van Rossum0612d842000-03-10 23:20:43 +0000753###
754
755class StreamRecoder:
756
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000757 """ StreamRecoder instances translate data from one encoding to another.
Fred Drake49fd1072000-04-13 14:11:21 +0000758
759 They use the complete set of APIs returned by the
760 codecs.lookup() function to implement their task.
761
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000762 Data written to the StreamRecoder is first decoded into an
763 intermediate format (depending on the "decode" codec) and then
764 written to the underlying stream using an instance of the provided
765 Writer class.
Fred Drake49fd1072000-04-13 14:11:21 +0000766
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000767 In the other direction, data is read from the underlying stream using
768 a Reader instance and then encoded and returned to the caller.
Fred Drake49fd1072000-04-13 14:11:21 +0000769
770 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000771 # Optional attributes set by the file wrappers below
772 data_encoding = 'unknown'
773 file_encoding = 'unknown'
774
Tim Peters30324a72001-05-15 17:19:16 +0000775 def __init__(self, stream, encode, decode, Reader, Writer,
776 errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000777
778 """ Creates a StreamRecoder instance which implements a two-way
779 conversion: encode and decode work on the frontend (the
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000780 data visible to .read() and .write()) while Reader and Writer
781 work on the backend (the data in stream).
Guido van Rossum0612d842000-03-10 23:20:43 +0000782
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000783 You can use these objects to do transparent
784 transcodings from e.g. latin-1 to utf-8 and back.
Guido van Rossum0612d842000-03-10 23:20:43 +0000785
786 stream must be a file-like object.
787
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000788 encode and decode must adhere to the Codec interface; Reader and
Guido van Rossum0612d842000-03-10 23:20:43 +0000789 Writer must be factory functions or classes providing the
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000790 StreamReader and StreamWriter interfaces resp.
Guido van Rossum0612d842000-03-10 23:20:43 +0000791
792 Error handling is done in the same way as defined for the
793 StreamWriter/Readers.
794
795 """
796 self.stream = stream
797 self.encode = encode
798 self.decode = decode
799 self.reader = Reader(stream, errors)
800 self.writer = Writer(stream, errors)
801 self.errors = errors
802
Tim Peters30324a72001-05-15 17:19:16 +0000803 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000804
805 data = self.reader.read(size)
806 data, bytesencoded = self.encode(data, self.errors)
807 return data
808
Tim Peters30324a72001-05-15 17:19:16 +0000809 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000810
811 if size is None:
812 data = self.reader.readline()
813 else:
814 data = self.reader.readline(size)
815 data, bytesencoded = self.encode(data, self.errors)
816 return data
817
Tim Peters30324a72001-05-15 17:19:16 +0000818 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000819
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000820 data = self.reader.read()
Guido van Rossuma3277132000-04-11 15:37:43 +0000821 data, bytesencoded = self.encode(data, self.errors)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300822 return data.splitlines(keepends=True)
Guido van Rossuma3277132000-04-11 15:37:43 +0000823
Georg Brandla18af4e2007-04-21 15:47:16 +0000824 def __next__(self):
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000825
826 """ Return the next decoded line from the input stream."""
Georg Brandla18af4e2007-04-21 15:47:16 +0000827 data = next(self.reader)
Walter Dörwaldc5238b82005-09-01 11:56:53 +0000828 data, bytesencoded = self.encode(data, self.errors)
829 return data
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000830
831 def __iter__(self):
832 return self
833
Tim Peters30324a72001-05-15 17:19:16 +0000834 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000835
836 data, bytesdecoded = self.decode(data, self.errors)
837 return self.writer.write(data)
838
Tim Peters30324a72001-05-15 17:19:16 +0000839 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000840
Jelle Zijlstrab3be4072019-05-22 08:18:26 -0700841 data = b''.join(list)
Guido van Rossuma3277132000-04-11 15:37:43 +0000842 data, bytesdecoded = self.decode(data, self.errors)
843 return self.writer.write(data)
Guido van Rossum0612d842000-03-10 23:20:43 +0000844
845 def reset(self):
846
847 self.reader.reset()
848 self.writer.reset()
849
Ammar Askara6ec1ce2019-05-31 12:44:01 -0700850 def seek(self, offset, whence=0):
851 # Seeks must be propagated to both the readers and writers
852 # as they might need to reset their internal buffers.
853 self.reader.seek(offset, whence)
854 self.writer.seek(offset, whence)
855
Tim Peters30324a72001-05-15 17:19:16 +0000856 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000857 getattr=getattr):
858
859 """ Inherit all other methods from the underlying stream.
860 """
Tim Peters30324a72001-05-15 17:19:16 +0000861 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000862
Thomas Wouters89f507f2006-12-13 04:49:30 +0000863 def __enter__(self):
864 return self
865
866 def __exit__(self, type, value, tb):
867 self.stream.close()
868
Guido van Rossum0612d842000-03-10 23:20:43 +0000869### Shortcuts
870
Alexey Izbysheva2670562018-10-20 03:22:31 +0300871def open(filename, mode='r', encoding=None, errors='strict', buffering=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000872
873 """ Open an encoded file using the given mode and return
874 a wrapped version providing transparent encoding/decoding.
875
876 Note: The wrapped version will only accept the object format
877 defined by the codecs, i.e. Unicode objects for most builtin
Skip Montanaro9f5f9d92005-03-16 03:51:56 +0000878 codecs. Output is also codec dependent and will usually be
Guido van Rossum0612d842000-03-10 23:20:43 +0000879 Unicode as well.
880
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000881 Underlying encoded files are always opened in binary mode.
882 The default file mode is 'r', meaning to open the file in read mode.
Marc-André Lemburg349a3d32000-06-21 21:21:04 +0000883
Guido van Rossum0612d842000-03-10 23:20:43 +0000884 encoding specifies the encoding which is to be used for the
Walter Dörwald7f3ed742003-02-02 23:08:27 +0000885 file.
Guido van Rossum0612d842000-03-10 23:20:43 +0000886
887 errors may be given to define the error handling. It defaults
888 to 'strict' which causes ValueErrors to be raised in case an
889 encoding error occurs.
890
891 buffering has the same meaning as for the builtin open() API.
Alexey Izbysheva2670562018-10-20 03:22:31 +0300892 It defaults to -1 which means that the default buffer size will
893 be used.
Guido van Rossum0612d842000-03-10 23:20:43 +0000894
Fred Drake49fd1072000-04-13 14:11:21 +0000895 The returned wrapped file object provides an extra attribute
896 .encoding which allows querying the used encoding. This
897 attribute is only available if an encoding was specified as
898 parameter.
899
Guido van Rossum0612d842000-03-10 23:20:43 +0000900 """
901 if encoding is not None and \
902 'b' not in mode:
903 # Force opening of the file in binary mode
904 mode = mode + 'b'
Georg Brandl1a3284e2007-12-02 09:40:06 +0000905 file = builtins.open(filename, mode, buffering)
Guido van Rossum0612d842000-03-10 23:20:43 +0000906 if encoding is None:
907 return file
Thomas Woutersa9773292006-04-21 09:43:23 +0000908 info = lookup(encoding)
909 srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
Guido van Rossuma3277132000-04-11 15:37:43 +0000910 # Add attributes to simplify introspection
911 srw.encoding = encoding
912 return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000913
Guido van Rossuma3277132000-04-11 15:37:43 +0000914def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000915
916 """ Return a wrapped version of file which provides transparent
917 encoding translation.
918
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000919 Data written to the wrapped file is decoded according
920 to the given data_encoding and then encoded to the underlying
921 file using file_encoding. The intermediate data type
Guido van Rossuma3277132000-04-11 15:37:43 +0000922 will usually be Unicode but depends on the specified codecs.
Guido van Rossum0612d842000-03-10 23:20:43 +0000923
Nick Coghlanb9fdb7a2015-01-07 00:22:00 +1000924 Bytes read from the file are decoded using file_encoding and then
925 passed back to the caller encoded using data_encoding.
Guido van Rossuma3277132000-04-11 15:37:43 +0000926
927 If file_encoding is not given, it defaults to data_encoding.
Guido van Rossum0612d842000-03-10 23:20:43 +0000928
929 errors may be given to define the error handling. It defaults
930 to 'strict' which causes ValueErrors to be raised in case an
931 encoding error occurs.
932
Fred Drake49fd1072000-04-13 14:11:21 +0000933 The returned wrapped file object provides two extra attributes
934 .data_encoding and .file_encoding which reflect the given
935 parameters of the same name. The attributes can be used for
936 introspection by Python programs.
937
Guido van Rossum0612d842000-03-10 23:20:43 +0000938 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000939 if file_encoding is None:
940 file_encoding = data_encoding
Thomas Wouters89f507f2006-12-13 04:49:30 +0000941 data_info = lookup(data_encoding)
942 file_info = lookup(file_encoding)
943 sr = StreamRecoder(file, data_info.encode, data_info.decode,
944 file_info.streamreader, file_info.streamwriter, errors)
Guido van Rossuma3277132000-04-11 15:37:43 +0000945 # Add attributes to simplify introspection
946 sr.data_encoding = data_encoding
947 sr.file_encoding = file_encoding
948 return sr
Guido van Rossum0612d842000-03-10 23:20:43 +0000949
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000950### Helpers for codec lookup
951
952def getencoder(encoding):
953
954 """ Lookup up the codec for the given encoding and return
955 its encoder function.
956
957 Raises a LookupError in case the encoding cannot be found.
958
959 """
Thomas Woutersa9773292006-04-21 09:43:23 +0000960 return lookup(encoding).encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000961
962def getdecoder(encoding):
963
964 """ Lookup up the codec for the given encoding and return
965 its decoder function.
966
967 Raises a LookupError in case the encoding cannot be found.
968
969 """
Thomas Woutersa9773292006-04-21 09:43:23 +0000970 return lookup(encoding).decode
971
972def getincrementalencoder(encoding):
973
974 """ Lookup up the codec for the given encoding and return
975 its IncrementalEncoder class or factory function.
976
977 Raises a LookupError in case the encoding cannot be found
978 or the codecs doesn't provide an incremental encoder.
979
980 """
981 encoder = lookup(encoding).incrementalencoder
982 if encoder is None:
983 raise LookupError(encoding)
984 return encoder
985
986def getincrementaldecoder(encoding):
987
988 """ Lookup up the codec for the given encoding and return
989 its IncrementalDecoder class or factory function.
990
991 Raises a LookupError in case the encoding cannot be found
992 or the codecs doesn't provide an incremental decoder.
993
994 """
995 decoder = lookup(encoding).incrementaldecoder
996 if decoder is None:
997 raise LookupError(encoding)
998 return decoder
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000999
1000def getreader(encoding):
1001
1002 """ Lookup up the codec for the given encoding and return
1003 its StreamReader class or factory function.
1004
1005 Raises a LookupError in case the encoding cannot be found.
1006
1007 """
Thomas Woutersa9773292006-04-21 09:43:23 +00001008 return lookup(encoding).streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001009
1010def getwriter(encoding):
1011
1012 """ Lookup up the codec for the given encoding and return
1013 its StreamWriter class or factory function.
1014
1015 Raises a LookupError in case the encoding cannot be found.
1016
1017 """
Thomas Woutersa9773292006-04-21 09:43:23 +00001018 return lookup(encoding).streamwriter
1019
1020def iterencode(iterator, encoding, errors='strict', **kwargs):
1021 """
1022 Encoding iterator.
1023
Martin Panter7462b6492015-11-02 03:37:02 +00001024 Encodes the input strings from the iterator using an IncrementalEncoder.
Thomas Woutersa9773292006-04-21 09:43:23 +00001025
1026 errors and kwargs are passed through to the IncrementalEncoder
1027 constructor.
1028 """
1029 encoder = getincrementalencoder(encoding)(errors, **kwargs)
1030 for input in iterator:
1031 output = encoder.encode(input)
1032 if output:
1033 yield output
1034 output = encoder.encode("", True)
1035 if output:
1036 yield output
1037
1038def iterdecode(iterator, encoding, errors='strict', **kwargs):
1039 """
1040 Decoding iterator.
1041
Martin Panter7462b6492015-11-02 03:37:02 +00001042 Decodes the input strings from the iterator using an IncrementalDecoder.
Thomas Woutersa9773292006-04-21 09:43:23 +00001043
1044 errors and kwargs are passed through to the IncrementalDecoder
1045 constructor.
1046 """
1047 decoder = getincrementaldecoder(encoding)(errors, **kwargs)
1048 for input in iterator:
1049 output = decoder.decode(input)
1050 if output:
1051 yield output
Walter Dörwaldca8a8d02007-05-04 13:05:09 +00001052 output = decoder.decode(b"", True)
Thomas Woutersa9773292006-04-21 09:43:23 +00001053 if output:
1054 yield output
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001055
Marc-André Lemburga866df82001-01-03 21:29:14 +00001056### Helpers for charmap-based codecs
1057
1058def make_identity_dict(rng):
1059
1060 """ make_identity_dict(rng) -> dict
1061
1062 Return a dictionary where elements of the rng sequence are
1063 mapped to themselves.
Tim Peters88869f92001-01-14 23:36:06 +00001064
Marc-André Lemburga866df82001-01-03 21:29:14 +00001065 """
Antoine Pitrouaaefac72012-06-16 22:48:21 +02001066 return {i:i for i in rng}
Marc-André Lemburga866df82001-01-03 21:29:14 +00001067
Marc-André Lemburg716cf912001-05-16 09:41:45 +00001068def make_encoding_map(decoding_map):
1069
1070 """ Creates an encoding map from a decoding map.
1071
Walter Dörwald7f3ed742003-02-02 23:08:27 +00001072 If a target mapping in the decoding map occurs multiple
Marc-André Lemburg716cf912001-05-16 09:41:45 +00001073 times, then that target is mapped to None (undefined mapping),
1074 causing an exception when encountered by the charmap codec
1075 during translation.
1076
1077 One example where this happens is cp875.py which decodes
Serhiy Storchaka9f8a8912015-04-03 18:12:41 +03001078 multiple character to \\u001a.
Marc-André Lemburg716cf912001-05-16 09:41:45 +00001079
1080 """
1081 m = {}
1082 for k,v in decoding_map.items():
Raymond Hettinger54f02222002-06-01 14:18:47 +00001083 if not v in m:
Marc-André Lemburg716cf912001-05-16 09:41:45 +00001084 m[v] = k
1085 else:
1086 m[v] = None
1087 return m
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001088
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001089### error handlers
1090
Martin v. Löwise2713be2005-03-08 15:03:08 +00001091try:
1092 strict_errors = lookup_error("strict")
1093 ignore_errors = lookup_error("ignore")
1094 replace_errors = lookup_error("replace")
1095 xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
1096 backslashreplace_errors = lookup_error("backslashreplace")
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001097 namereplace_errors = lookup_error("namereplace")
Martin v. Löwise2713be2005-03-08 15:03:08 +00001098except LookupError:
1099 # In --disable-unicode builds, these error handler are missing
1100 strict_errors = None
1101 ignore_errors = None
1102 replace_errors = None
1103 xmlcharrefreplace_errors = None
1104 backslashreplace_errors = None
Serhiy Storchaka166ebc42014-11-25 13:57:17 +02001105 namereplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001106
Martin v. Löwis6cd441d2001-07-31 08:54:55 +00001107# Tell modulefinder that using codecs probably needs the encodings
1108# package
1109_false = 0
1110if _false:
1111 import encodings
1112
Guido van Rossum0612d842000-03-10 23:20:43 +00001113### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001114
Guido van Rossum0612d842000-03-10 23:20:43 +00001115if __name__ == '__main__':
1116
Guido van Rossuma3277132000-04-11 15:37:43 +00001117 # Make stdout translate Latin-1 output into UTF-8 output
1118 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001119
Guido van Rossuma3277132000-04-11 15:37:43 +00001120 # Have stdin translate Latin-1 input into UTF-8 input
1121 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')