blob: 39ec8454acd93940b58af428c1ee500be8fe6feb [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
# The registry functions and the stateless builtin codec functions are
# implemented in the _codecs extension module; re-export all of them from
# here.  Without that module nothing in this file can work, so a failed
# import is reported as a SystemError that carries the original reason.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Public names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000033
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# The unsuffixed names follow the byte order of the running interpreter.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  Note that the "32" names hold
# the UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000078
79
80### Codec base classes (defining the API)
81
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader, streamwriter)
    and additionally exposes every constructor argument as an attribute of
    the same name (encode, decode, streamreader, streamwriter,
    incrementalencoder, incrementaldecoder and name).
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only these four entries are stored in the tuple itself; the
        # incremental factories and the name are available as attributes
        # only.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        # Leave the class-level default in place unless the caller
        # explicitly overrides it for this instance.
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +0000112
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences.
         'namereplace'       - Replace with \\N{...} escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError
178
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
218
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered input followed by input and return the output.

        Whatever _buffer_encode() reports as not consumed is kept in
        self.buffer and prepended to the input of the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the encoder and discard any buffered input."""
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """Return the buffered input, or 0 when the buffer is empty."""
        return self.buffer or 0

    def setstate(self, state):
        """Restore the buffer from state; a false state empties it."""
        self.buffer = state or ""
252
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
301
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered bytes followed by input and return the output.

        Whatever _buffer_decode() reports as not consumed is kept in
        self.buffer and prepended to the input of the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the decoder and discard any buffered bytes."""
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """Return (buffered bytes, 0)."""
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """Restore the buffered bytes from state[0]."""
        # ignore additional state info
        self.buffer = state[0]
337
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000344
class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to a
        wrapped stream.

        Attributes that are not found on the writer itself are looked up
        on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences.
             'namereplace'       - Replace with \\N{...} escape sequences.

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # encode() returns (output, length consumed); only the output is
        # needed here.
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Set the underlying stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the writer has no stream yet.
        """
        # __getattr__ only runs after normal lookup has failed.  If the
        # missing attribute is 'stream' itself (e.g. on an instance that
        # was created without running __init__), evaluating self.stream
        # below would call __getattr__ again and recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
415
Guido van Rossum0612d842000-03-10 23:20:43 +0000416###
417
class StreamReader(Codec):

    """ Reads data from a wrapped stream and decodes it with
        self.decode().

        Attributes that are not found on the reader itself are looked up
        on the wrapped stream.
    """

    # Type of the decoded data; an empty instance of it is used as the
    # initial value of the character buffer.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'backslashreplace' - Replace with backslashed escape sequences;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been returned to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of decoded lines cached by readline(); while it is in use
        # charbuffer is set to None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of decoded code points or bytes to
            return. read() will never return more data than requested,
            but it might return less, if there is not enough available.

            size indicates the approximate maximum number of decoded
            bytes or code points to read for decoding. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes before the error; give up
                    # unless that prefix contains at least one complete
                    # line in addition to the trailing fragment.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # Grow the chunk size for the next attempt, up to a limit.
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the reader has no stream yet.
        """
        # __getattr__ only runs after normal lookup has failed.  If the
        # missing attribute is 'stream' itself (e.g. on an instance that
        # was created without running __init__), evaluating self.stream
        # below would call __getattr__ again and recurse without bound.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
662
Guido van Rossum0612d842000-03-10 23:20:43 +0000663###
664
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's readline(size)."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's readlines(sizehint)."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        """ Return self; lines are produced by __next__()."""
        return self

    def write(self, data):

        """ Pass data to the writer's write() and return its result."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Pass the sequence to the writer's writelines() and return
            its result.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream.

            The reader is always reset afterwards.  The writer is
            only reset for an absolute seek to offset 0, i.e. back to
            the very start of the stream.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream has no such
            attribute, or if the 'stream' attribute itself has not
            been set yet.
        """
        # Without this guard, reading self.stream on an instance whose
        # 'stream' attribute is missing (for example one created with
        # __new__ and never initialised) would re-enter __getattr__
        # and recurse until RecursionError.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        """ Return self as the object managed by the with-block."""
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream; exceptions are not suppressed."""
        self.stream.close()
749
Guido van Rossum0612d842000-03-10 23:20:43 +0000750###
751
class StreamRecoder:

    """ StreamRecoder instances translate data from one encoding to another.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (depending on the "decode" codec) and then
        written to the underlying stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the underlying stream using
        a Reader instance and then encoded and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data visible to .read() and .write()) while Reader and Writer
            work on the backend (the data in stream).

            You can use these objects to do transparent
            transcodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface; Reader and
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the Reader and return the data re-encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line through the Reader and return it re-encoded
            with the frontend encode function.

            size is only forwarded to the Reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the Reader provides, re-encode it and
            return it split into lines (line ends are kept).

            sizehint is accepted but not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        """ Return self; lines are produced by __next__()."""
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and hand the
            result to the Writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given lines, decode them with the frontend
            decode function and hand the result to the Writer.

            The lines may be bytes-like objects (what a bytes-to-text
            frontend codec expects in write()) or str objects (for
            text-to-text frontend codecs).
        """
        # Materialise first: the argument may be a one-shot iterable and
        # the first item decides which kind of separator is valid.
        chunks = tuple(list)
        if chunks and not isinstance(chunks[0], str):
            # ''.join() raises TypeError for bytes-like items.
            data = b''.join(chunks)
        else:
            # str items, or an empty sequence (historical behaviour).
            data = ''.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the Reader and the Writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream has no such
            attribute, or if the 'stream' attribute itself has not
            been set yet.
        """
        # Without this guard, reading self.stream on an instance whose
        # 'stream' attribute is missing would re-enter __getattr__ and
        # recurse until RecursionError.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        """ Return self as the object managed by the with-block."""
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream; exceptions are not suppressed."""
        self.stream.close()
859
Guido van Rossum0612d842000-03-10 23:20:43 +0000860### Shortcuts
861
def open(filename, mode='r', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the underlying file is always
        opened in binary mode ('b' is appended to mode if missing).
        The default file mode is 'r', meaning to open the file in
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API
        and is passed to it unchanged.  Its default value is 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file handle when wrapping fails, e.g.
        # because the encoding name is unknown.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000903
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is decoded with data_encoding and
        then stored in file using file_encoding.  Data read from file
        is decoded with file_encoding and returned to the caller
        encoded in data_encoding.  The intermediate data type will
        usually be Unicode but depends on the specified codecs.

        file_encoding falls back to data_encoding when it is not
        given.

        errors selects the error handling.  The default 'strict'
        causes ValueErrors to be raised when an encoding error occurs.

        The wrapper carries two extra attributes, .data_encoding and
        .file_encoding, holding the encoding names that were used, so
        that Python programs can introspect them.

    """
    target_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(target_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = target_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000939
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000940### Helpers for codec lookup
941
def getencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000951
def getdecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
961
def getincrementalencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its IncrementalEncoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
975
def getincrementaldecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its IncrementalDecoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000989
def getreader(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000999
def getwriter(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1009
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from the iterator through one IncrementalEncoder
    and yields each non-empty result.  After the input is exhausted the
    encoder is flushed with a final call, and that output is yielded
    too if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encode = make_encoder(errors, **kwargs).encode
    for chunk in iterator:
        encoded = encode(chunk)
        if encoded:
            yield encoded
    # final=True: let the encoder emit whatever it is still holding
    tail = encode("", True)
    if tail:
        yield tail
1027
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every input item from the iterator through one
    IncrementalDecoder and yields each non-empty result.  After the
    input is exhausted the decoder is flushed with a final call, and
    that output is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decode = make_decoder(errors, **kwargs).decode
    for chunk in iterator:
        decoded = decode(chunk)
        if decoded:
            yield decoded
    # final=True: let the decoder emit whatever it is still holding
    tail = decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001045
Marc-André Lemburga866df82001-01-03 21:29:14 +00001046### Helpers for charmap-based codecs
1047
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of rng is a key
        that maps to itself.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001057
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Keys and values of the decoding map swap roles.  If a target
        value occurs more than once in the decoding map, that target
        is mapped to None (undefined mapping), causing an exception
        when encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # The first sighting of a target keeps its source; any repeat
        # marks the target as ambiguous.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001078
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001079### error handlers
1080
# Bind the standard error handlers to module-level names.
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = map(lookup_error, (
         "strict",
         "ignore",
         "replace",
         "xmlcharrefreplace",
         "backslashreplace",
         "namereplace",
     ))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # if any lookup fails, every name is bound to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001096
# Tell modulefinder that using codecs probably needs the encodings
# package.
# The branch below is never taken (_false is 0), so this statement does
# not import encodings at run time; presumably it only has to be present
# so that tools scanning the module for import statements can see it.
# NOTE(review): keep the guard as a module-level name; a literal
# constant might let the import be dropped at compile time -- confirm
# before changing.
_false = 0
if _false:
    import encodings
1102
Guido van Rossum0612d842000-03-10 23:20:43 +00001103### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001104
if __name__ == '__main__':

    # The recoding wrappers exchange bytes with the stream they wrap,
    # while sys.stdout and sys.stdin are text streams.  Wrap their
    # binary .buffer when one is available and fall back to the
    # stream object itself otherwise.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(getattr(sys.stdout, 'buffer', sys.stdout),
                             'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(getattr(sys.stdin, 'buffer', sys.stdin),
                            'utf-8', 'latin-1')