# blob: f6c24481687df4c98469e129cb4d80c9c4ddcc94
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import builtins
import sys

### Registry and builtin stateless codec functions

# Everything public in the _codecs extension module is re-exported from
# here.  A failing import is re-raised as SystemError, chained to the
# original ImportError so the root cause stays visible in the traceback.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why

# Names exported by "from codecs import *".  The classes defined in this
# module are part of the public API and are therefore listed as well.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "BufferedIncrementalEncoder", "BufferedIncrementalDecoder",
           "StreamWriter", "StreamReader", "StreamReaderWriter",
           "StreamRecoder",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  The numeric suffix does not
# match the encoding: BOM32_* are the UTF-16 marks and BOM64_* are the
# UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Bundle of the callables that make up one codec.

    The instance is the 4-tuple (encode, decode, streamreader,
    streamwriter).  The same four objects, the incremental encoder and
    decoder factories and the codec name are also available as
    attributes of the same names.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        """Create the tuple and attach all arguments as attributes."""
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        """Return a description showing the class, codec name and id."""
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides a readable buffer
            (the bf_getreadbuf buffer slot). Byte strings, buffer
            objects and memory mapped files are examples of objects
            providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        """Create the encoder with an empty buffer of unencoded input."""
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered input followed by input and return the output.

        Whatever _buffer_encode() reports as not consumed is stored in
        self.buffer and prepended to the input of the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the encoder and drop any buffered input."""
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """Return the buffered input, or 0 when the buffer is empty."""
        return self.buffer or 0

    def setstate(self, state):
        """Restore the buffer from state; a false state means empty."""
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        """Create the decoder with an empty buffer of undecoded input."""
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered bytes followed by input and return the output.

        Whatever _buffer_decode() reports as not consumed is stored in
        self.buffer and prepended to the input of the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """Reset the decoder and drop any buffered bytes."""
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """Return (buffered bytes, 0)."""
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """Restore the buffered bytes from the first item of state."""
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to
        an underlying stream.

        Attributes not defined here are looked up on the stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # The consumed length reported by encode() is not used.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Set the stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the "with" block closes the underlying stream.
        self.stream.close()

###

class StreamReader(Codec):

    """ Reads bytes from an underlying stream and returns them decoded
        with self.decode().

        Attributes not defined here are looked up on the stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that have not been decoded yet
        self.bytebuffer = b""
        # decoded characters that have not been returned yet
        self.charbuffer = ""
        # lines cached by readline(); while this is a list, charbuffer
        # is None and the pending characters live here instead
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # Can the request be satisfied from the character buffer?
            # With neither chars nor size given the caller asked for
            # everything, so buffered characters alone never satisfy the
            # request: the rest of the stream still has to be read.
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Decode only the part in front of the error; this is
                    # acceptable when it contains more than one line.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # no line end yet: read a larger chunk on the next attempt
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the "with" block closes the underlying stream.
        self.stream.close()

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read from the stream through the reader."""
        return self.reader.read(size)

    def readline(self, size=None, keepends=True):

        """ Read one line through the reader.

            size and keepends are passed on to the reader's readline().
        """
        return self.reader.readline(size, keepends)

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines through the reader.

            sizehint and keepends are passed on to the reader's
            readlines().
        """
        return self.reader.readlines(sizehint, keepends)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Write data to the stream through the writer."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Write a list of strings to the stream through the writer."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Set the stream's current position.

            The reader is always reset; the writer is reset only when
            seeking to the very start of the stream (offset 0 with
            whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller re-encoded
        with the frontend encode function.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded.  size is only passed on when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything from the backend reader, re-encode it and
            return it as a list of lines with line ends kept.

            sizehint is accepted for interface compatibility but is
            not used: the whole stream is always read.
        """
        data = self.reader.read()
        data, _ = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = next(self.reader)
        data, _ = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result via the backend writer.
        """
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given lines, decode them with the frontend
            decode function and write the result via the backend
            writer.

            The lines are joined with a separator of the same type as
            the first element, so both bytes lines (the usual case for
            text codecs) and str lines (str-to-str codecs) work.
            An empty sequence writes nothing.
        """
        # Materialize once: the argument may be a one-shot iterator and
        # the first element has to be inspected before joining.
        lines = tuple(list)
        if not lines:
            return None
        separator = '' if isinstance(lines[0], str) else b''
        data = separator.join(lines)
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with EncodedFile(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
843
Guido van Rossum0612d842000-03-10 23:20:43 +0000844### Shortcuts
845
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file handle when wrapping fails
        # (e.g. LookupError for an unknown encoding).
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000889
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is decoded with the data_encoding
        codec and then written to file through the file_encoding
        stream writer.

        Data read from the wrapper is read from file through the
        file_encoding stream reader and handed back encoded with the
        data_encoding codec.

        file_encoding defaults to data_encoding when it is None.

        errors selects the error handling scheme and is passed on to
        the StreamRecoder; it defaults to 'strict'.

        The returned object carries the attributes .data_encoding and
        .file_encoding holding the encoding names that were used.

    """
    if file_encoding is None:
        backend_name = data_encoding
    else:
        backend_name = file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_name)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Record the encodings on the instance for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_name
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000925
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000926### Helpers for codec lookup
927
def getencoder(encoding):

    """ Look up the codec registered for encoding and return its
        encoder function.

        The LookupError raised by lookup() for an unknown encoding
        is not caught here.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000937
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return its
        decoder function.

        The LookupError raised by lookup() for an unknown encoding
        is not caught here.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
947
def getincrementalencoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder
    raise LookupError(encoding)
961
def getincrementaldecoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000975
def getreader(encoding):

    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function.

        The LookupError raised by lookup() for an unknown encoding
        is not caught here.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000985
def getwriter(encoding):

    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function.

        The LookupError raised by lookup() for an unknown encoding
        is not caught here.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
995
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every item of iterator to an IncrementalEncoder for the
    given encoding and yields each non-empty chunk it produces,
    followed by whatever the encoder emits on the final flush.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    incremental = make_encoder(errors, **kwargs)
    for piece in iterator:
        encoded = incremental.encode(piece)
        if not encoded:
            continue
        yield encoded
    # Final call with an empty string flushes any pending state
    tail = incremental.encode("", True)
    if tail:
        yield tail
1013
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item of iterator to an IncrementalDecoder for the
    given encoding and yields each non-empty chunk it produces,
    followed by whatever the decoder emits on the final flush.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    incremental = make_decoder(errors, **kwargs)
    for piece in iterator:
        decoded = incremental.decode(piece)
        if not decoded:
            continue
        yield decoded
    # Final call with empty bytes flushes any pending state
    tail = incremental.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001031
Marc-André Lemburga866df82001-01-03 21:29:14 +00001032### Helpers for charmap-based codecs
1033
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the iterable
        rng is both key and value.

    """
    return {item: item for item in rng}
1046
def make_encoding_map(decoding_map):

    """ Creates an encoding map by inverting a decoding map.

        A target value that occurs more than once in the decoding
        map is ambiguous and is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # First sighting keeps the source; any repeat marks it undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001067
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001068### error handlers
1069
try:
    # map() is lazy, so the lookups run (and may raise) during the
    # unpacking inside this try block.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
    ))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001083
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is always 0, so the import below never executes;
# it exists only so that an "import encodings" statement appears in
# this module's source.
_false = 0
if _false:
    import encodings
1089
Guido van Rossum0612d842000-03-10 23:20:43 +00001090### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001091
Guido van Rossum0612d842000-03-10 23:20:43 +00001092if __name__ == '__main__':
1093
Guido van Rossuma3277132000-04-11 15:37:43 +00001094 # Make stdout translate Latin-1 output into UTF-8 output
1095 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001096
Guido van Rossuma3277132000-04-11 15:37:43 +00001097 # Have stdin translate Latin-1 input into UTF-8 input
1098 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')