blob: e120d636bcb28bf5410c6e4f192e1ecaf89237f5 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Walter Dörwald6a7ec7c2006-03-18 16:35:17 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Public names of this module; the order is kept exactly as before.
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    # byte order mark constants (including the old misnamed aliases)
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8",
    "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec description and base classes
    "CodecInfo", "Codec",
    "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    "getencoder", "getdecoder",
    "getincrementalencoder", "getincrementaldecoder",
    "getreader", "getwriter",
    "encode", "decode", "iterencode", "iterdecode",
    # error handling callbacks and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000032
Guido van Rossum0612d842000-03-10 23:20:43 +000033### Constants
34
35#
Walter Dörwald474458d2002-06-04 15:16:29 +000036# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000039#
Guido van Rossum0612d842000-03-10 23:20:43 +000040
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16 / UTF-32 marks in the byte order of the running machine
if sys.byteorder != 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in
# their names they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000077
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The object is the 4-tuple (encode, decode, streamreader,
    streamwriter) and additionally carries every constructor argument,
    including the incremental coders and the codec name, as an
    attribute of the same name.
    """

    # Private API to allow Python to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will
    # hopefully be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # codecs count as text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                _is_text_encoding=None):
        # Only these four entries are part of the tuple value itself.
        tuple_items = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, tuple_items)
        info.name = name
        info.encode, info.decode = encode, decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter, info.streamreader = streamwriter, streamreader
        # Leave the class-level default in place unless a value was given.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
109
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode efficiently should use a
            StreamWriter instead.

            Zero length input must be accepted and answered with an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to decode efficiently should use a
            StreamReader instead.

            Zero length input must be accepted and answered with an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
173
class IncrementalEncoder(object):
    """
    Encoder that processes its input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme used by the encoder;
        see the module docstring for a list of possible values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
213
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that cannot always encode all
    of the input they are given: whatever _buffer_encode() reports as
    not consumed is kept in self.buffer and prepended to the input of
    the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet, carried across encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        encoded, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) clears the buffer
        self.buffer = state if state else ""
246
class IncrementalDecoder(object):
    """
    Decoder that processes its input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme used by the decoder;
        see the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer that
        represents the decoder state WITHOUT the contents of
        buffered_input having been processed. In the initial state
        and after reset(), getstate() must return (b"", 0), which is
        what this base implementation always returns.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
295
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() reports as not
    consumed is kept in self.buffer and prepended to the input of the
    next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet, carried across decode() calls
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # leftovers from the previous call come first
        pending = self.buffer + input
        decoded, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the buffer is the only state; additional state info is always 0
        leftover = self.buffer
        return (leftover, 0)

    def setstate(self, state):
        # only the buffered input is restored; additional state info is ignored
        leftover = state[0]
        self.buffer = leftover
330
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000337
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used by the
            StreamWriter. These parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further parameter values can be made available via
            register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        # the consumed length reported by encode() is not used here
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and hand the result to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; the codec state is reset
            only when seeking to the very start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        if whence != 0 or offset != 0:
            return
        self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        wrapped = self.stream
        return getattr(wrapped, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
408
Guido van Rossum0612d842000-03-10 23:20:43 +0000409###
410
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of already split lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by the concrete codec.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available. A negative chars falls back to the
            value of size.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        if chars < 0:
            # For compatibility with other read() methods that take a
            # single argument
            chars = size

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes in front of the error; this
                    # is accepted only if it yields more than one line
                    newchars, decodedbytes = self.decode(data[:exc.start],
                                                         self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method, and read() is then called only once.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size for the next attempt, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Use the same empty str as __init__() and read() do.  Resetting
        # to u"" would force an implicit ASCII decode of the output of
        # str->str decoders on the next read and fail for non-ASCII
        # bytes; unicode decoders still promote the buffer themselves.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
652
Guido van Rossum0612d842000-03-10 23:20:43 +0000653###
654
class StreamReaderWriter:

    """ Wrapper for streams that work in both read and write modes.

        Reading is delegated to a reader object and writing to a
        writer object, both created on top of the same stream.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # the reader is created before the writer
        self.reader, self.writer = Reader(stream, errors), Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Delegate to the reader's read(). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Delegate to the reader's readline(). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Delegate to the reader's readlines(). """
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    def write(self, data):
        """ Delegate to the writer's write(). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Delegate to the writer's writelines(). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        """ Reset the reader first, then the writer. """
        for side in (self.reader, self.writer):
            side.reset()

    def seek(self, offset, whence=0):
        """ Reposition the stream. The reader is always reset, the
            writer only when seeking to the very start
            (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence != 0 or offset != 0:
            return
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        wrapped = self.stream
        return getattr(wrapped, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
739
Guido van Rossum0612d842000-03-10 23:20:43 +0000740###
741
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.  size is passed to the reader.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encoder.

            When size is None the reader's readline() is called
            without an argument so that its own default applies.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the reader has left, encode it with the
            frontend encoder and return it as a list of lines with
            the line ends kept.

            sizehint is accepted but not used: the complete remaining
            input is always read.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            through the backend writer.  Returns what the writer's
            write() returns.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the given strings, decode the result with the
            frontend decoder and write it through the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the state of the backend reader and writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the position of the underlying stream.

            Without this method a seek would be delegated straight to
            the stream by __getattr__ and the reader would keep
            returning data it buffered before the seek.  The reader is
            therefore reset on every seek; the writer is reset only
            when seeking to the start of the stream (offset 0,
            whence 0), the same policy StreamReaderWriter.seek() uses.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with" statements work on the recoder

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Returns None, so exceptions from the with-block propagate.
        self.stream.close()
855
Guido van Rossum0612d842000-03-10 23:20:43 +0000856### Shortcuts
857
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, files are always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file that was just opened is closed again before the
        exception is propagated.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    raw_file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return raw_file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(raw_file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything: whatever went wrong, the
        # file must not stay open.  The exception is always re-raised.
        raw_file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000906
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings transparently.

        Strings written to the wrapper are interpreted according to
        data_encoding and stored in the original file using
        file_encoding. The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        Strings read from the file are interpreted according to
        file_encoding and handed to the caller as strings in
        data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        For introspection the returned wrapper carries the attributes
        .data_encoding and .file_encoding, holding the encodings that
        are actually in effect.

    """
    file_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode, frontend.decode,
        backend.streamreader, backend.streamwriter,
        errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000942
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000943### Helpers for codec lookup
944
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000954
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
964
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        A LookupError is raised when the encoding cannot be found
        or when the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
978
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        A LookupError is raised when the encoding cannot be found
        or when the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000992
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001002
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        A LookupError is raised when the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1012
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each input string from the iterator to a single
    IncrementalEncoder and yields every non-empty chunk of output,
    followed by whatever the encoder emits when it is flushed.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for piece in iterator:
        encoded = encoder.encode(piece)
        if not encoded:
            continue
        yield encoded
    # Final call: flush any state the encoder still holds.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1030
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each input string from the iterator to a single
    IncrementalDecoder and yields every non-empty chunk of output,
    followed by whatever the decoder emits when it is flushed.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for piece in iterator:
        decoded = decoder.decode(piece)
        if not decoded:
            continue
        yield decoded
    # Final call: flush any state the decoder still holds.
    tail = decoder.decode("", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001048
Marc-André Lemburga866df82001-01-03 21:29:14 +00001049### Helpers for charmap-based codecs
1050
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is a key mapped to itself.

    """
    return dict((item, item) for item in rng)
1063
def make_encoding_map(decoding_map):

    """ Creates an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map
        cannot be inverted unambiguously. Such a target is mapped to
        None (undefined mapping), causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: mark the reverse mapping as undefined.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001084
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001085### error handlers
1086
try:
    # Bind the standard error handlers to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
    ))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001100
# Tell modulefinder that using codecs probably needs the encodings
# package.
# The import is guarded by a flag that is always false, so it is never
# executed when this module is loaded; presumably it only has to be
# present for tools that scan the module for import statements.
_false = 0
if _false:
    import encodings
1106
Guido van Rossum0612d842000-03-10 23:20:43 +00001107### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001108
# Ad-hoc demonstration when the module is run as a script: the standard
# streams are replaced by recoding wrappers created with EncodedFile().
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding is 'latin-1', file_encoding is 'utf-8')
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (data_encoding is 'utf-8', file_encoding is 'latin-1')
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')