blob: c20cd6c18bf33adabe92b0adda6002e19d320119 [file] [log] [blame]
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9
import builtins
import sys

### Registry and builtin stateless codec functions

# Every public name of the builtin _codecs module is re-exported from here.
# If that module cannot be imported this module is unusable, so the failure
# is reported as a SystemError with the ImportError attached as its cause.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "backslashreplace_errors", "namereplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000033
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  Note that the "32" names hold
# the UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000078
79
### Codec base classes (defining the API)
81
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader, streamwriter)
    and additionally carries every constructor argument as an attribute of
    the same name.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only the first four values are part of the tuple itself; the
        # incremental codecs and the name are reachable as attributes only.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        # Leave the class-level default in place unless explicitly overridden.
        if _is_text_encoding is not None:
            self._is_text_encoding = _is_text_encoding
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at %#x>" % \
                (self.__class__.__module__, self.__class__.__qualname__,
                 self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +0000112
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError
177
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
217
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().

    Subclasses implement _buffer_encode(); input that it reports as not
    consumed is stored in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered remainder plus input and return the output.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the encoder and discard any buffered input.
        """
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """
        Return the buffered input, or 0 when the buffer is empty.
        """
        return self.buffer or 0

    def setstate(self, state):
        """
        Restore the buffer from state; a false value (such as the 0
        returned by getstate() for an empty buffer) clears it.
        """
        self.buffer = state or ""
251
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
300
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.

    Subclasses implement _buffer_decode(); input that it reports as not
    consumed is stored in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered remainder plus input and return the output.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the decoder and discard any buffered input.
        """
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """
        Return (buffered_input, 0).
        """
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """
        Restore the buffer from the first item of state.
        """
        # ignore additional state info
        self.buffer = state[0]
336
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000343
class StreamWriter(Codec):

    """ Encodes objects and writes the result to a wrapped stream.

        Encoding is done by the inherited encode() method.  Attributes
        not found on the writer are looked up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # The consumed length reported by encode() is not used here.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # __getattr__ only runs after normal lookup has failed.  If the
        # missing attribute is "stream" itself (an instance created without
        # __init__), delegating to self.stream would recurse without bound,
        # so report the attribute as missing instead.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
414
Guido van Rossum0612d842000-03-10 23:20:43 +0000415###
416
class StreamReader(Codec):

    """ Reads from a wrapped stream and decodes what it reads.

        Decoding is done by the decode() method, which subclasses must
        provide.  Attributes not found on the reader are looked up on
        the wrapped stream.
    """

    # Type of the decoded data; calling it with no arguments must give
    # the empty value used to initialise and clear the character buffer.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been returned to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines cached by readline(), or None;
        # while it is set, charbuffer is None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Decode only up to the offending byte; the error is
                    # re-raised unless that prefix holds more than one line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # grow the read size for the next attempt, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        # __getattr__ only runs after normal lookup has failed.  If the
        # missing attribute is "stream" itself (an instance created without
        # __init__), delegating to self.stream would recurse without bound,
        # so report the attribute as missing instead.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the wrapped stream.
        self.stream.close()
662
Guido van Rossum0612d842000-03-10 23:20:43 +0000663###
664
class StreamReaderWriter:

    """ Wrap a stream that is used in both read and write mode.

        Reading is delegated to a reader object and writing to a
        writer object, both created over the same stream. The
        factory functions returned by the codec.lookup() function
        can be passed in directly to construct the instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.;
            each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Return self.reader.read(size)."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return self.reader.readline(size)."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return self.reader.readlines(sizehint)."""
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        """ Return self; iteration goes through __next__()."""
        return self

    def write(self, data):

        """ Return self.writer.write(data)."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Return self.writer.writelines(list)."""
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer."""
        for side in (self.reader, self.writer):
            side.reset()

    def seek(self, offset, whence=0):
        """ Seek on the underlying stream and reset the reader.

            The writer is reset only for a seek to the very start,
            i.e. when both whence and offset equal 0.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if not (whence == 0 and offset == 0):
            return
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        """ Return self as the value bound by the with statement."""
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream on leaving the with block."""
        stream = self.stream
        stream.close()
749
Guido van Rossum0612d842000-03-10 23:20:43 +0000750###
751
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller re-encoded
        with the frontend codec.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, i.e.
            they are called as func(data, errors) and return a
            2-tuple whose first item is the converted data. Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, _consumed = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encode function.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _consumed = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the reader has left, re-encode it and
            return it split into lines (line ends are kept).

            sizehint is accepted for interface compatibility but is
            not used: the whole remaining input is always read.
        """
        data = self.reader.read()
        data, _consumed = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = next(self.reader)
        data, _consumed = self.encode(data, self.errors)
        return data

    def __iter__(self):
        """ Return self; iteration goes through __next__()."""
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result through the backend writer.
        """
        data, _consumed = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the chunks in list, decode the result with the
            frontend decode function and write it through the backend
            writer.

            Chunks are normally bytes, which is what the decode
            function of a text encoding expects. A list of str chunks
            is still joined as str so that str-to-str frontend codecs
            keep working. An empty list is treated as empty bytes.
        """
        # Materialize once: list may be a one-shot iterable and the
        # first chunk is inspected before joining.
        chunks = tuple(list)
        # Joining bytes with '' raises TypeError, so the separator
        # has to match the chunk type.
        if chunks and isinstance(chunks[0], str):
            separator = ''
        else:
            separator = b''
        data = separator.join(chunks)
        data, _consumed = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Return self as the value bound by the with statement."""
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream on leaving the with block."""
        self.stream.close()
865
Guido van Rossum0612d842000-03-10 23:20:43 +0000866### Shortcuts
867
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError if the encoding cannot be found; the
        file opened for it is closed again before the error is
        propagated.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
    except BaseException:
        # Nobody else holds a reference to the file yet, so close it
        # here instead of leaking the open handle.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000911
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is interpreted according
        to the given data_encoding and then written to the original
        file using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Data is read from the file using file_encoding and then
        passed back to the caller using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    file_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000947
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000948### Helpers for codec lookup
949
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000959
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
969
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
983
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000997
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001007
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1017
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.
    After the iterator is exhausted the encoder is flushed with a
    final call and any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # final=True flushes whatever the encoder is still holding back
    tail = encoder.encode("", True)
    if tail:
        yield tail
1035
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input from the iterator using an IncrementalDecoder
    and yields every non-empty piece of output. After the iterator
    is exhausted the decoder is flushed with a final call and any
    remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # final=True makes the decoder deal with any incomplete input
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001053
Marc-André Lemburga866df82001-01-03 21:29:14 +00001054### Helpers for charmap-based codecs
1055
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is both key and value.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001065
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Every (key, target) pair of the decoding map becomes a
        (target, key) pair of the result.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        # a target seen before is ambiguous and therefore undefined
        encoding_map[target] = None if target in encoding_map else code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001087### error handlers
1088
try:
    # Bind the registered handler callbacks to module-level names
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace",
                             "namereplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001104
# Tell modulefinder that using codecs probably needs the encodings
# package. The import sits behind a variable that is always false, so
# it is present in the module's code but never executed at runtime.
_false = 0
if _false:
    import encodings
1110
Guido van Rossum0612d842000-03-10 23:20:43 +00001111### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001112
if __name__ == '__main__':

    # Executed only when this module is run as a script.
    # NOTE(review): StreamRecoder passes encoded data to the wrapped
    # stream's write(); if sys.stdout / sys.stdin are text streams here,
    # their binary .buffer may have to be wrapped instead -- confirm.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')