blob: ba9c073feb3f853b97b8f31e4ae95484a06aaee5 [file] [log] [blame]
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9
import builtins
import sys

12### Registry and builtin stateless codec functions
13
# Pull the C-implemented registry functions and stateless codec helpers
# into this module's namespace. The module cannot work without them, so a
# failed import is reported as a SystemError carrying the original reason.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# The "native" aliases follow the byte order of the running interpreter.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code): the "32"/"64" names are
# aliases of the UTF-16/UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter) and additionally exposes those entries, the
    incremental encoder/decoder factories and the codec name as
    attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        """Create the info record.

        Only encode, decode, streamreader and streamwriter become tuple
        items; the remaining arguments are stored as attributes only.
        _is_text_encoding overrides the class-level default when it is
        not None.
        """
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        if _is_text_encoding is not None:
            # Shadow the class attribute on this instance only.
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError
176
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
216
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered input plus input via _buffer_encode() and
        return the encoded result; whatever _buffer_encode() reports as
        not consumed is kept for the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the encoder and discard any buffered input."""
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """Return the buffered input, or 0 when the buffer is empty."""
        return self.buffer or 0

    def setstate(self, state):
        """Restore the buffer from state; a false state means empty."""
        self.buffer = state or ""
250
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
299
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered bytes plus input via _buffer_decode() and
        return the decoded result; whatever _buffer_decode() reports as
        not consumed is kept for the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the decoder and discard any buffered bytes."""
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """Return (buffered bytes, 0)."""
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """Restore the buffered bytes from the first item of state."""
        # ignore additional state info
        self.buffer = state[0]
335
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Set the output stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError for 'stream' itself: this method only
            runs when normal lookup failed, so reaching it for 'stream'
            means the instance was created without __init__ (e.g. via
            __new__), and delegating would recurse without bound.
        """
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()

###
415
class StreamReader(Codec):

    # Type used to create the empty character buffer (see __init__).
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been handed out yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes before the error; give up
                    # unless that yields more than one line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # grow the read size for the next attempt, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries if keepends
            is true.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError for 'stream' itself: this method only
            runs when normal lookup failed, so reaching it for 'stream'
            means the instance was created without __init__ (e.g. via
            __new__), and delegating would recurse without bound.
        """
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()

###
663
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

        Reading is delegated to a reader object and writing to a
        writer object; both are created by the constructor around the
        same stream.  Attributes not defined by this class are looked
        up on the wrapped stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of self.reader.read(size). """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of self.reader.readline(size). """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of self.reader.readlines(sizehint). """
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Pass data to self.writer.write() and return its result. """
        return self.writer.write(data)

    def writelines(self, list):

        """ Pass list to self.writer.writelines() and return its result. """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream and reset the reader.

            The writer is only reset when seeking to the very start of
            the stream (offset 0 relative to the beginning).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        # NOTE(review): presumably the writer keeps state that is only
        # valid to restart at position 0 (e.g. an initial BOM) -- confirm.
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no 'stream'
            attribute itself (for example when it was created without
            running __init__).
        """
        if name == 'stream':
            # Without this guard the self.stream access below would
            # call __getattr__('stream') again and recurse until
            # RecursionError.
            raise AttributeError(name)
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
748
Guido van Rossum0612d842000-03-10 23:20:43 +0000749###
750
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encode function.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the reader still provides, encode it and
            return it split into lines (line ends are kept).

            sizehint is accepted but not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, encoded with
            the frontend encode function.
        """
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, decode them with the
            frontend decode function and write the result through the
            backend writer.

            The chunks must all be of the kind that write() accepts.
            str chunks are joined as str; anything else (including an
            empty sequence) is joined as bytes.
        """
        chunks = tuple(list)
        # The frontend data is usually bytes, so joining with '' raised
        # TypeError.  Pick the joiner from the first chunk so that str
        # based frontend codecs keep working as before.
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the backend reader and the backend writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no 'stream'
            attribute itself (for example when it was created without
            running __init__).
        """
        if name == 'stream':
            # Without this guard the self.stream access below would
            # call __getattr__('stream') again and recurse until
            # RecursionError.
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
864
Guido van Rossum0612d842000-03-10 23:20:43 +0000865### Shortcuts
866
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the object returned by the builtin open()
        is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed on to the builtin open() API; its default
        here is 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or creating the wrapper fails, the
        already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
    except BaseException:
        # Nobody else holds a reference to the file yet; close it here
        # so a bad encoding name does not leak the handle.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000910
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is decoded using the codec for
        data_encoding and then written to file through the stream
        writer of the codec for file_encoding.

        Data read from the wrapper is obtained through the stream
        reader of the file_encoding codec and handed back encoded with
        the data_encoding codec.

        When file_encoding is omitted (None), data_encoding is used
        for it as well.

        errors selects the error handling and is passed on to the
        StreamRecoder; the default is 'strict'.

        The wrapper gets two attributes, .data_encoding and
        .file_encoding, holding the names that were used, so callers
        can inspect them later.

    """
    file_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Record the encoding names for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000946
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000947### Helpers for codec lookup
948
def getencoder(encoding):

    """ Look up the codec registered for encoding and return the
        value of its .encode attribute (the encoder function).

        A LookupError is raised when no such encoding is known.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000958
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return the
        value of its .decode attribute (the decoder function).

        A LookupError is raised when no such encoding is known.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
968
def getincrementalencoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        A LookupError is raised when the encoding is unknown, and also
        when the codec has no incremental encoder (its
        .incrementalencoder attribute is None).

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
982
def getincrementaldecoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        A LookupError is raised when the encoding is unknown, and also
        when the codec has no incremental decoder (its
        .incrementaldecoder attribute is None).

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000996
def getreader(encoding):

    """ Look up the codec registered for encoding and return the
        value of its .streamreader attribute (the StreamReader class
        or factory function).

        A LookupError is raised when no such encoding is known.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001006
def getwriter(encoding):

    """ Look up the codec registered for encoding and return the
        value of its .streamwriter attribute (the StreamWriter class
        or factory function).

        A LookupError is raised when no such encoding is known.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1016
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every item of iterator to one IncrementalEncoder for the
    given encoding and yields each non-empty piece of output.  After
    the input is exhausted the encoder is called once more with an
    empty string and final=True, and any remaining output is yielded.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    incremental = getincrementalencoder(encoding)(errors, **kwargs)
    for piece in iterator:
        encoded = incremental.encode(piece)
        if encoded:
            yield encoded
    tail = incremental.encode("", True)
    if tail:
        yield tail
1034
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item of iterator to one IncrementalDecoder for the
    given encoding and yields each non-empty piece of output.  After
    the input is exhausted the decoder is called once more with empty
    bytes and final=True, and any remaining output is yielded.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    incremental = getincrementaldecoder(encoding)(errors, **kwargs)
    for piece in iterator:
        decoded = incremental.decode(piece)
        if decoded:
            yield decoded
    tail = incremental.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001052
Marc-André Lemburga866df82001-01-03 21:29:14 +00001053### Helpers for charmap-based codecs
1054
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element produced by
        iterating over rng is both a key and that key's value.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001064
def make_encoding_map(decoding_map):

    """ Build the reverse (encoding) map of a decoding map.

        Every value of decoding_map becomes a key that maps back to
        the key it came from.  A value that occurs more than once in
        decoding_map is mapped to None (undefined mapping) instead,
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous, so it is marked undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001085
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001086### error handlers
1087
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # all five names are then bound to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001101
Martin v. Löwis6cd441d2001-07-31 08:54:55 +00001102# Tell modulefinder that using codecs probably needs the encodings
1103# package
1104_false = 0
1105if _false:
1106 import encodings
1107
Guido van Rossum0612d842000-03-10 23:20:43 +00001108### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001109
Guido van Rossum0612d842000-03-10 23:20:43 +00001110if __name__ == '__main__':
1111
Guido van Rossuma3277132000-04-11 15:37:43 +00001112 # Make stdout translate Latin-1 output into UTF-8 output
1113 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001114
Guido van Rossuma3277132000-04-11 15:37:43 +00001115 # Have stdin translate Latin-1 input into UTF-8 input
1116 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')