blob: 590238ec502bfb6466e85b9177e6c31bd51ce1cb [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
# Pull the registry functions and the builtin stateless codec functions
# in from the _codecs module; this module cannot work without them.
try:
    from _codecs import *
except ImportError as why:
    # "except ... as" is accepted by Python 2.6+ and is the only spelling
    # that Python 3 understands.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Names exported by "from codecs import *", grouped by purpose.
__all__ = [
    # registry access and file wrappers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks (including the old, misnamed BOM32_*/BOM64_* aliases)
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # lookup shortcuts
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    # convenience functions
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000032
Guido van Rossum0612d842000-03-10 23:20:43 +000033### Constants
34
35#
Walter Dörwald474458d2002-06-04 15:16:29 +000036# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000039#
Guido van Rossum0612d842000-03-10 23:20:43 +000040
# UTF-8 signature
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16: big endian and little endian marks, plus their short aliases
BOM_UTF16_BE = '\xfe\xff'
BOM_UTF16_LE = '\xff\xfe'
BOM_BE = BOM_UTF16_BE
BOM_LE = BOM_UTF16_LE

# UTF-32: big endian and little endian marks
BOM_UTF32_BE = '\x00\x00\xfe\xff'
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# The "native" marks follow the byte order of the running interpreter.
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in the
# name they are the UTF-16 and UTF-32 marks respectively.
BOM32_BE = BOM_UTF16_BE
BOM32_LE = BOM_UTF16_LE
BOM64_BE = BOM_UTF32_BE
BOM64_LE = BOM_UTF32_LE
Guido van Rossum0612d842000-03-10 23:20:43 +000077
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Codec details handed out when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter) and additionally exposes every constructor argument
    as an attribute of the same name.
    """

    # Private API that lets Python blacklist the known non-Unicode codecs
    # shipped in the standard library.  A more general mechanism to
    # reliably tell text encodings apart from other codecs will hopefully
    # be defined for Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # codecs count as text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                _is_text_encoding=None):
        # Only the four "classic" entries are part of the tuple value.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        # stateless functions
        info.encode, info.decode = encode, decode
        # incremental codec factories
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        # stream codec factories
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        # Shadow the class-level default only when a value was passed in.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
109
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return the tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return the tuple
            (output object, length consumed).

            input must be an object providing the bf_getreadbuf buffer
            slot; Python strings, buffer objects and memory mapped
            files are examples of such objects.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
173
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme used by the encoder; see
        the module docstring for a list of possible values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        The base implementation ignores state.
        """
213
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always encode
    everything they are given: input that _buffer_encode() did not
    consume is kept in a buffer and prepended to the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input carried over from one encode() call to the next
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        if leftover:
            return leftover
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) empties the buffer
        if state:
            self.buffer = state
        else:
            self.buffer = ""
246
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme used by the decoder; see
        the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple:

        * buffered_input must be a bytes object holding the bytes that
          were passed to decode() but have not been converted yet;
        * additional_state_info must be a non-negative integer describing
          the decoder state WITHOUT the contents of buffered_input having
          been processed.

        In the initial state and after reset() the result must be
        (b"", 0), which is what this base implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        setstate((b"", 0)) must have the same effect as reset().  The
        base implementation ignores state.
        """
295
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: input that _buffer_decode() did not
    consume is kept in a buffer and prepended to the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input carried over from one decode() call to the next
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # there is no additional state besides the buffer, hence the 0
        buffered = self.buffer
        return (buffered, 0)

    def setstate(self, state):
        # only the buffered input matters; the second item is ignored
        buffered = state[0]
        self.buffer = buffered
330
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000337
class StreamWriter(Codec):

    """ Encoding wrapper around a writable stream.

        Objects passed to .write() are run through .encode() and the
        result is written to the wrapped stream.  Attributes that the
        writer does not define itself are looked up on that stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme of the writer.
            These parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further parameter values can be made available via
            register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encodes object and writes the result to self.stream.
        """
        # encode() also reports how much input it consumed; that count
        # is not used here.
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenates the given list of strings and passes the
            result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream; the codec state is reset
            only when seeking to the very beginning (offset 0 with
            whence 0).
        """
        self.stream.seek(offset, whence)
        rewound = whence == 0 and offset == 0
        if rewound:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()
408
Guido van Rossum0612d842000-03-10 23:20:43 +0000409###
410
411class StreamReader(Codec):
412
Tim Peters30324a72001-05-15 17:19:16 +0000413 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000414
415 """ Creates a StreamReader instance.
416
417 stream must be a file-like object open for reading
418 (binary) data.
419
Walter Dörwald7f82f792002-11-19 21:42:53 +0000420 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000421 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000422 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000423
424 'strict' - raise a ValueError (or a subclass)
425 'ignore' - ignore the character and continue with the next
426 'replace'- replace with a suitable replacement character;
427
Walter Dörwald7f82f792002-11-19 21:42:53 +0000428 The set of allowed parameter values can be extended via
429 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000430 """
431 self.stream = stream
432 self.errors = errors
Walter Dörwald69652032004-09-07 20:24:22 +0000433 self.bytebuffer = ""
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000434 # For str->str decoding this will stay a str
435 # For str->unicode decoding the first read will promote it to unicode
436 self.charbuffer = ""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000437 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000438
Walter Dörwald69652032004-09-07 20:24:22 +0000439 def decode(self, input, errors='strict'):
440 raise NotImplementedError
441
Martin v. Löwis56066d22005-08-24 07:38:12 +0000442 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000443
444 """ Decodes data from the stream self.stream and returns the
445 resulting object.
446
Walter Dörwald69652032004-09-07 20:24:22 +0000447 chars indicates the number of characters to read from the
448 stream. read() will never return more than chars
449 characters, but it might return less, if there are not enough
450 characters available.
451
Guido van Rossum0612d842000-03-10 23:20:43 +0000452 size indicates the approximate maximum number of bytes to
453 read from the stream for decoding purposes. The decoder
454 can modify this setting as appropriate. The default value
455 -1 indicates to read and decode as much as possible. size
456 is intended to prevent having to decode huge files in one
457 step.
458
Martin v. Löwis56066d22005-08-24 07:38:12 +0000459 If firstline is true, and a UnicodeDecodeError happens
460 after the first line terminator in the input only the first line
461 will be returned, the rest of the input will be kept until the
462 next call to read().
463
Guido van Rossum0612d842000-03-10 23:20:43 +0000464 The method should use a greedy read strategy meaning that
465 it should read as much data as is allowed within the
466 definition of the encoding and the given size, e.g. if
467 optional encoding endings or state markers are available
468 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000469 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000470 # If we have lines cached, first merge them back into characters
471 if self.linebuffer:
472 self.charbuffer = "".join(self.linebuffer)
473 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000474
Walter Dörwald69652032004-09-07 20:24:22 +0000475 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000476 while True:
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200477 # can the request be satisfied from the character buffer?
478 if chars >= 0:
Walter Dörwald69652032004-09-07 20:24:22 +0000479 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000480 break
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200481 elif size >= 0:
482 if len(self.charbuffer) >= size:
483 break
Walter Dörwald69652032004-09-07 20:24:22 +0000484 # we need more data
485 if size < 0:
486 newdata = self.stream.read()
487 else:
488 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000489 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000490 data = self.bytebuffer + newdata
Martin v. Löwis56066d22005-08-24 07:38:12 +0000491 try:
492 newchars, decodedbytes = self.decode(data, self.errors)
493 except UnicodeDecodeError, exc:
494 if firstline:
495 newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
496 lines = newchars.splitlines(True)
497 if len(lines)<=1:
498 raise
499 else:
500 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000501 # keep undecoded bytes until the next call
502 self.bytebuffer = data[decodedbytes:]
503 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000504 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000505 # there was no data available
506 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000507 break
508 if chars < 0:
509 # Return everything we've got
510 result = self.charbuffer
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000511 self.charbuffer = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000512 else:
513 # Return the first chars characters
514 result = self.charbuffer[:chars]
515 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000516 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000517
Walter Dörwald69652032004-09-07 20:24:22 +0000518 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000519
520 """ Read one line from the input stream and return the
521 decoded data.
522
Walter Dörwald69652032004-09-07 20:24:22 +0000523 size, if given, is passed as size argument to the
524 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000525
Guido van Rossuma3277132000-04-11 15:37:43 +0000526 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000527 # If we have lines cached from an earlier read, return
528 # them unconditionally
529 if self.linebuffer:
530 line = self.linebuffer[0]
531 del self.linebuffer[0]
532 if len(self.linebuffer) == 1:
533 # revert to charbuffer mode; we might need more data
534 # next time
535 self.charbuffer = self.linebuffer[0]
536 self.linebuffer = None
537 if not keepends:
538 line = line.splitlines(False)[0]
539 return line
Tim Peters536cf992005-12-25 23:18:31 +0000540
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000541 readsize = size or 72
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000542 line = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000543 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000544 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000545 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000546 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000547 # If we're at a "\r" read one extra character (which might
548 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000549 # temporarily exhausted we return the wrong line ending.
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000550 if data.endswith("\r"):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000551 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000552
Walter Dörwald69652032004-09-07 20:24:22 +0000553 line += data
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000554 lines = line.splitlines(True)
555 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000556 if len(lines) > 1:
557 # More than one line result; the first line is a full line
558 # to return
559 line = lines[0]
560 del lines[0]
561 if len(lines) > 1:
562 # cache the remaining lines
563 lines[-1] += self.charbuffer
564 self.linebuffer = lines
565 self.charbuffer = None
566 else:
567 # only one remaining line, put it back into charbuffer
568 self.charbuffer = lines[0] + self.charbuffer
569 if not keepends:
570 line = line.splitlines(False)[0]
571 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000572 line0withend = lines[0]
573 line0withoutend = lines[0].splitlines(False)[0]
574 if line0withend != line0withoutend: # We really have a line end
575 # Put the rest back together and keep it until the next call
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000576 self.charbuffer = "".join(lines[1:]) + self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000577 if keepends:
578 line = line0withend
579 else:
580 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000581 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000582 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000583 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000584 if line and not keepends:
585 line = line.splitlines(False)[0]
586 break
587 if readsize<8000:
588 readsize *= 2
589 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000590
Walter Dörwald69652032004-09-07 20:24:22 +0000591 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000592
593 """ Read all lines available on the input stream
594 and return them as list of lines.
595
596 Line breaks are implemented using the codec's decoder
597 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000598
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000599 sizehint, if given, is ignored since there is no efficient
600 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000601
602 """
Walter Dörwald69652032004-09-07 20:24:22 +0000603 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000604 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000605
606 def reset(self):
607
608 """ Resets the codec buffers used for keeping state.
609
610 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000611 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000612 from decoding errors.
613
614 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000615 self.bytebuffer = ""
616 self.charbuffer = u""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000617 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000618
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000619 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000620 """ Set the input stream's current position.
621
622 Resets the codec buffers used for keeping state.
623 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000624 self.stream.seek(offset, whence)
Victor Stinner7df55da2010-05-22 13:37:56 +0000625 self.reset()
Guido van Rossum0612d842000-03-10 23:20:43 +0000626
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000627 def next(self):
628
629 """ Return the next decoded line from the input stream."""
630 line = self.readline()
631 if line:
632 return line
633 raise StopIteration
634
635 def __iter__(self):
636 return self
637
Tim Peters30324a72001-05-15 17:19:16 +0000638 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000639 getattr=getattr):
640
641 """ Inherit all other methods from the underlying stream.
642 """
Tim Peters30324a72001-05-15 17:19:16 +0000643 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000644
Georg Brandl8f99f812006-10-29 08:39:22 +0000645 def __enter__(self):
646 return self
647
648 def __exit__(self, type, value, tb):
649 self.stream.close()
650
Guido van Rossum0612d842000-03-10 23:20:43 +0000651###
652
653class StreamReaderWriter:
654
Fred Drake49fd1072000-04-13 14:11:21 +0000655 """ StreamReaderWriter instances allow wrapping streams which
656 work in both read and write modes.
657
658 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000659 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000660 instance.
661
662 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000663 # Optional attributes set by the file wrappers below
664 encoding = 'unknown'
665
Tim Peters30324a72001-05-15 17:19:16 +0000666 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000667
668 """ Creates a StreamReaderWriter instance.
669
670 stream must be a Stream-like object.
671
672 Reader, Writer must be factory functions or classes
673 providing the StreamReader, StreamWriter interface resp.
674
675 Error handling is done in the same way as defined for the
676 StreamWriter/Readers.
677
678 """
679 self.stream = stream
680 self.reader = Reader(stream, errors)
681 self.writer = Writer(stream, errors)
682 self.errors = errors
683
Tim Peters30324a72001-05-15 17:19:16 +0000684 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000685
686 return self.reader.read(size)
687
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000688 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000689
690 return self.reader.readline(size)
691
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000692 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000693
694 return self.reader.readlines(sizehint)
695
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000696 def next(self):
697
698 """ Return the next decoded line from the input stream."""
699 return self.reader.next()
700
701 def __iter__(self):
702 return self
703
Tim Peters30324a72001-05-15 17:19:16 +0000704 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000705
706 return self.writer.write(data)
707
Tim Peters30324a72001-05-15 17:19:16 +0000708 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000709
710 return self.writer.writelines(list)
711
Guido van Rossum0612d842000-03-10 23:20:43 +0000712 def reset(self):
713
714 self.reader.reset()
715 self.writer.reset()
716
Victor Stinner262be5e2010-05-22 02:11:07 +0000717 def seek(self, offset, whence=0):
Victor Stinner7df55da2010-05-22 13:37:56 +0000718 self.stream.seek(offset, whence)
719 self.reader.reset()
720 if whence == 0 and offset == 0:
721 self.writer.reset()
Victor Stinner262be5e2010-05-22 02:11:07 +0000722
Tim Peters30324a72001-05-15 17:19:16 +0000723 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000724 getattr=getattr):
725
726 """ Inherit all other methods from the underlying stream.
727 """
Tim Peters30324a72001-05-15 17:19:16 +0000728 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000729
Georg Brandl8f99f812006-10-29 08:39:22 +0000730 # these are needed to make "with codecs.open(...)" work properly
731
732 def __enter__(self):
733 return self
734
735 def __exit__(self, type, value, tb):
736 self.stream.close()
737
Guido van Rossum0612d842000-03-10 23:20:43 +0000738###
739
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read through the backend reader (size is forwarded to it)
            and return the result converted with the frontend encode
            function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line through the backend reader and return it
            converted with the frontend encode function.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader delivers, convert it
            with the frontend encode function and return it as a list
            of lines with their line ends kept.

            sizehint is accepted for interface compatibility but is
            not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def next(self):

        """ Return the next line from the backend reader, converted
            with the frontend encode function.
        """
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Convert data with the frontend decode function and write
            the result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings, convert them in one go with the
            frontend decode function and write the result through the
            backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and the backend writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and reset codec state.

            Without this method a seek would be delegated straight to
            the stream by __getattr__, leaving the reader holding
            data from the old position.  The reader is therefore
            reset after every seek; the writer is reset only when
            seeking to the very start of the stream (offset 0 with
            whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with EncodedFile(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
853
Guido van Rossum0612d842000-03-10 23:20:43 +0000854### Shortcuts
855
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, files are always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. A 'U' in
        the mode is dropped in that case as well. The default file
        mode is 'rb' meaning to open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file that was just opened is closed again before the
        exception is propagated.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Cleanup only: do not leak the open file handle (e.g. on an
        # unknown encoding); the original exception is always re-raised.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000904
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings on the fly.

        Strings written to the wrapper are taken to be in
        data_encoding and end up in the original file in
        file_encoding.  Strings read from the file are taken to be in
        file_encoding and are handed to the caller in data_encoding.
        The intermediate format will usually be Unicode but depends
        on the specified codecs.

        When file_encoding is omitted it defaults to data_encoding.

        errors selects the error handling.  It defaults to 'strict',
        which causes ValueErrors to be raised in case an encoding
        error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the encoding names
        that were used, for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Record the encoding names on the wrapper for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000940
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000941### Helpers for codec lookup
942
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000952
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
962
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
976
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000990
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001000
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
1010
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from the iterator to an IncrementalEncoder for
    the given encoding and yields each non-empty piece of output.
    After the input is exhausted the encoder is called one final time
    with an empty string so that any pending output is yielded too.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    tail = encoder.encode("", True)
    if tail:
        yield tail
1028
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string from the iterator to an IncrementalDecoder for
    the given encoding and yields each non-empty piece of output.
    After the input is exhausted the decoder is called one final time
    with an empty string so that any pending output is yielded too.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    tail = decoder.decode("", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001046
Marc-André Lemburga866df82001-01-03 21:29:14 +00001047### Helpers for charmap-based codecs
1048
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both a key and the value stored under that key.

    """
    return dict((item, item) for item in rng)
1061
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map by swapping
        keys and values.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and becomes undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001082
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001083### error handlers
1084
try:
    # Publish the registered error handlers as module-level names
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error,
                                    ("strict",
                                     "ignore",
                                     "replace",
                                     "xmlcharrefreplace",
                                     "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001098
Martin v. Löwis6cd441d2001-07-31 08:54:55 +00001099# Tell modulefinder that using codecs probably needs the encodings
1100# package
1101_false = 0
1102if _false:
1103 import encodings
1104
Guido van Rossum0612d842000-03-10 23:20:43 +00001105### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001106
Guido van Rossum0612d842000-03-10 23:20:43 +00001107if __name__ == '__main__':
1108
Guido van Rossuma3277132000-04-11 15:37:43 +00001109 # Make stdout translate Latin-1 output into UTF-8 output
1110 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001111
Guido van Rossuma3277132000-04-11 15:37:43 +00001112 # Have stdin translate Latin-1 input into UTF-8 input
1113 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')