""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import builtins
import sys

### Registry and builtin stateless codec functions

# Pull every public name of the C accelerator module into this namespace.
# Without it the module cannot work, so an import failure is re-raised as
# SystemError, chained to the original ImportError.
try:
    from _codecs import *
except ImportError as why:
    raise SystemError('Failed to load the builtin codecs: %s' % why) from why

# Names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).  Despite the "32"/"64" in
# the names they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class CodecInfo(tuple):

    """ Bundle of the callables that make up one codec.

        The instance is the 4-tuple (encode, decode, streamreader,
        streamwriter).  The same four objects, together with
        incrementalencoder, incrementaldecoder and name, are also
        available as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        """ Create the tuple and attach every argument as an attribute.
        """
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        """ Return a description naming the class, the encoding and id().
        """
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses must override it.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can
    be passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Create the encoder with an empty buffer; errors is stored by the
        IncrementalEncoder initializer.
        """
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encode the buffered input followed by input and return the result.

        Whatever _buffer_encode() reports as not consumed is kept in
        self.buffer and prepended on the next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the base encoder and drop any buffered input.
        """
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """
        Return the buffered input, or 0 when the buffer is empty.
        """
        return self.buffer or 0

    def setstate(self, state):
        """
        Restore the buffer from state; a false state (such as the 0
        returned by getstate() for an empty buffer) gives an empty buffer.
        """
        self.buffer = state or ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can
    be passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the Codec class
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError;
        subclasses must override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete
    byte sequences.
    """
    def __init__(self, errors='strict'):
        """
        Create the decoder with an empty byte buffer; errors is stored by
        the IncrementalDecoder initializer.
        """
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decode the buffered bytes followed by input and return the result.

        Whatever _buffer_decode() reports as not consumed is kept in
        self.buffer and prepended on the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Reset the base decoder and drop any buffered bytes.
        """
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """
        Return (buffered bytes, 0).
        """
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """
        Restore the buffer from the first item of state.
        """
        # ignore additional state info
        self.buffer = state[0]

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    """ Wraps a writable stream: objects passed to .write() are run
        through self.encode() and the result is written to the stream.
        Attributes not defined here are looked up on the stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # The consumed length reported by encode() is not used here.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            reset() is called only when seeking to the very start of
            the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        if name == 'stream':
            # __getattr__ only runs after normal lookup has failed, so
            # reaching this point means the instance has no stream (it
            # was created without __init__).  Reading self.stream below
            # would re-enter this method without bound.
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()

###

398class StreamReader(Codec):
399
Georg Brandl02524622010-12-02 18:06:51 +0000400 charbuffertype = str
401
Tim Peters30324a72001-05-15 17:19:16 +0000402 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000403
404 """ Creates a StreamReader instance.
405
406 stream must be a file-like object open for reading
407 (binary) data.
408
Walter Dörwald7f82f792002-11-19 21:42:53 +0000409 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000410 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000411 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000412
413 'strict' - raise a ValueError (or a subclass)
414 'ignore' - ignore the character and continue with the next
415 'replace'- replace with a suitable replacement character;
416
Walter Dörwald7f82f792002-11-19 21:42:53 +0000417 The set of allowed parameter values can be extended via
418 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000419 """
420 self.stream = stream
421 self.errors = errors
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000422 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000423 self._empty_charbuffer = self.charbuffertype()
424 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000425 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000426
Walter Dörwald69652032004-09-07 20:24:22 +0000427 def decode(self, input, errors='strict'):
428 raise NotImplementedError
429
Martin v. Löwis56066d22005-08-24 07:38:12 +0000430 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000431
432 """ Decodes data from the stream self.stream and returns the
433 resulting object.
434
Walter Dörwald69652032004-09-07 20:24:22 +0000435 chars indicates the number of characters to read from the
436 stream. read() will never return more than chars
437 characters, but it might return less, if there are not enough
438 characters available.
439
Guido van Rossum0612d842000-03-10 23:20:43 +0000440 size indicates the approximate maximum number of bytes to
441 read from the stream for decoding purposes. The decoder
442 can modify this setting as appropriate. The default value
443 -1 indicates to read and decode as much as possible. size
444 is intended to prevent having to decode huge files in one
445 step.
446
Martin v. Löwis56066d22005-08-24 07:38:12 +0000447 If firstline is true, and a UnicodeDecodeError happens
448 after the first line terminator in the input only the first line
449 will be returned, the rest of the input will be kept until the
450 next call to read().
451
Guido van Rossum0612d842000-03-10 23:20:43 +0000452 The method should use a greedy read strategy meaning that
453 it should read as much data as is allowed within the
454 definition of the encoding and the given size, e.g. if
455 optional encoding endings or state markers are available
456 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000457 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000458 # If we have lines cached, first merge them back into characters
459 if self.linebuffer:
Georg Brandl02524622010-12-02 18:06:51 +0000460 self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000461 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000462
Walter Dörwald69652032004-09-07 20:24:22 +0000463 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000464 while True:
Tim Golden621302c2012-10-01 16:40:40 +0100465 # can the request be satisfied from the character buffer?
Walter Dörwald69652032004-09-07 20:24:22 +0000466 if chars < 0:
Walter Dörwaldca199432006-03-06 22:39:12 +0000467 if size < 0:
468 if self.charbuffer:
469 break
470 elif len(self.charbuffer) >= size:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000471 break
Guido van Rossum0612d842000-03-10 23:20:43 +0000472 else:
Walter Dörwald69652032004-09-07 20:24:22 +0000473 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000474 break
475 # we need more data
476 if size < 0:
477 newdata = self.stream.read()
478 else:
479 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000480 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000481 data = self.bytebuffer + newdata
Martin v. Löwis56066d22005-08-24 07:38:12 +0000482 try:
483 newchars, decodedbytes = self.decode(data, self.errors)
Guido van Rossumb940e112007-01-10 16:19:56 +0000484 except UnicodeDecodeError as exc:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000485 if firstline:
Walter Dörwald3abcb012007-04-16 22:10:50 +0000486 newchars, decodedbytes = \
487 self.decode(data[:exc.start], self.errors)
Ezio Melottid8b509b2011-09-28 17:37:55 +0300488 lines = newchars.splitlines(keepends=True)
Martin v. Löwis56066d22005-08-24 07:38:12 +0000489 if len(lines)<=1:
490 raise
491 else:
492 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000493 # keep undecoded bytes until the next call
494 self.bytebuffer = data[decodedbytes:]
495 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000496 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000497 # there was no data available
498 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000499 break
500 if chars < 0:
501 # Return everything we've got
502 result = self.charbuffer
Georg Brandl02524622010-12-02 18:06:51 +0000503 self.charbuffer = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000504 else:
505 # Return the first chars characters
506 result = self.charbuffer[:chars]
507 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000508 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000509
Walter Dörwald69652032004-09-07 20:24:22 +0000510 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000511
512 """ Read one line from the input stream and return the
513 decoded data.
514
Walter Dörwald69652032004-09-07 20:24:22 +0000515 size, if given, is passed as size argument to the
516 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000517
Guido van Rossuma3277132000-04-11 15:37:43 +0000518 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000519 # If we have lines cached from an earlier read, return
520 # them unconditionally
521 if self.linebuffer:
522 line = self.linebuffer[0]
523 del self.linebuffer[0]
524 if len(self.linebuffer) == 1:
525 # revert to charbuffer mode; we might need more data
526 # next time
527 self.charbuffer = self.linebuffer[0]
528 self.linebuffer = None
529 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300530 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000531 return line
Tim Peters536cf992005-12-25 23:18:31 +0000532
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000533 readsize = size or 72
Georg Brandl02524622010-12-02 18:06:51 +0000534 line = self._empty_charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000535 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000536 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000537 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000538 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000539 # If we're at a "\r" read one extra character (which might
540 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000541 # temporarily exhausted we return the wrong line ending.
Georg Brandl02524622010-12-02 18:06:51 +0000542 if (isinstance(data, str) and data.endswith("\r")) or \
543 (isinstance(data, bytes) and data.endswith(b"\r")):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000544 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000545
Walter Dörwald69652032004-09-07 20:24:22 +0000546 line += data
Ezio Melottid8b509b2011-09-28 17:37:55 +0300547 lines = line.splitlines(keepends=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000548 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000549 if len(lines) > 1:
550 # More than one line result; the first line is a full line
551 # to return
552 line = lines[0]
553 del lines[0]
554 if len(lines) > 1:
555 # cache the remaining lines
556 lines[-1] += self.charbuffer
557 self.linebuffer = lines
558 self.charbuffer = None
559 else:
560 # only one remaining line, put it back into charbuffer
561 self.charbuffer = lines[0] + self.charbuffer
562 if not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300563 line = line.splitlines(keepends=False)[0]
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000564 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000565 line0withend = lines[0]
Ezio Melottid8b509b2011-09-28 17:37:55 +0300566 line0withoutend = lines[0].splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000567 if line0withend != line0withoutend: # We really have a line end
568 # Put the rest back together and keep it until the next call
Georg Brandl02524622010-12-02 18:06:51 +0000569 self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
570 self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000571 if keepends:
572 line = line0withend
573 else:
574 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000575 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000576 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000577 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000578 if line and not keepends:
Ezio Melottid8b509b2011-09-28 17:37:55 +0300579 line = line.splitlines(keepends=False)[0]
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000580 break
Georg Brandl02524622010-12-02 18:06:51 +0000581 if readsize < 8000:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000582 readsize *= 2
583 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000584
Walter Dörwald69652032004-09-07 20:24:22 +0000585 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000586
587 """ Read all lines available on the input stream
588 and return them as list of lines.
589
590 Line breaks are implemented using the codec's decoder
591 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000592
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000593 sizehint, if given, is ignored since there is no efficient
594 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000595
596 """
Walter Dörwald69652032004-09-07 20:24:22 +0000597 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000598 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000599
600 def reset(self):
601
602 """ Resets the codec buffers used for keeping state.
603
604 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000605 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000606 from decoding errors.
607
608 """
Walter Dörwaldca8a8d02007-05-04 13:05:09 +0000609 self.bytebuffer = b""
Georg Brandl02524622010-12-02 18:06:51 +0000610 self.charbuffer = self._empty_charbuffer
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000611 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000612
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000613 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000614 """ Set the input stream's current position.
615
616 Resets the codec buffers used for keeping state.
617 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000618 self.stream.seek(offset, whence)
Victor Stinnera92ad7e2010-05-22 16:59:09 +0000619 self.reset()
Guido van Rossum0612d842000-03-10 23:20:43 +0000620
Georg Brandla18af4e2007-04-21 15:47:16 +0000621 def __next__(self):
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000622
623 """ Return the next decoded line from the input stream."""
624 line = self.readline()
625 if line:
626 return line
627 raise StopIteration
628
629 def __iter__(self):
630 return self
631
Tim Peters30324a72001-05-15 17:19:16 +0000632 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000633 getattr=getattr):
634
635 """ Inherit all other methods from the underlying stream.
636 """
Tim Peters30324a72001-05-15 17:19:16 +0000637 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000638
Thomas Wouters89f507f2006-12-13 04:49:30 +0000639 def __enter__(self):
640 return self
641
642 def __exit__(self, type, value, tb):
643 self.stream.close()
644
###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # Both wrappers share the same underlying stream.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's readline(size)."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's readlines(sizehint)."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Pass data to the writer's write() and return its result."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Pass list to the writer's writelines() and return its result."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream.

            The reader is always reset; the writer is reset only when
            seeking to the very start of the stream (offset 0 with
            whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        if name == 'stream':
            # __getattr__ only runs after normal lookup has failed, so
            # reaching this point means the instance has no stream (it
            # was created without __init__).  Reading self.stream below
            # would re-enter this method without bound.
            raise AttributeError(name)
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

Guido van Rossum0612d842000-03-10 23:20:43 +0000732###
733
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Placeholder values; EncodedFile() overwrites them on the instance
    # to simplify introspection.
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, i.e. be
            callable as f(data, errors) and return a (result, length)
            pair.  Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data re-encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything that is left, re-encode it and return it as
            a list of lines with line endings kept.

            sizehint is accepted for interface compatibility but is
            not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded with
            the frontend encoder."""
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        """ The recoder is its own iterator. """
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and hand the result
            to the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, decode them with the frontend
            decoder and hand the result to the backend writer.
        """
        chunks = tuple(list)
        # Join with an empty value of the chunks' own type: frontend
        # data is normally bytes, and ''.join() raises TypeError for
        # bytes chunks.  str chunks (str-to-str codecs) still work.
        empty = chunks[0][:0] if chunks else b''
        data = empty.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader first, then the backend writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and reset codec state.

            Without this method seek() would fall through __getattr__
            to the raw stream and the reader would keep serving data it
            had buffered before the seek.  The handling mirrors
            StreamReaderWriter.seek(): the reader is always reset, the
            writer only when seeking to the very start.

            Returns whatever the stream's own seek() returns.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Return the recoder itself as the context value. """
        return self

    def __exit__(self, type, value, tb):
        """ Close the underlying stream; exceptions are not suppressed. """
        self.stream.close()
847
Guido van Rossum0612d842000-03-10 23:20:43 +0000848### Shortcuts
849
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API
        and is passed through unchanged. It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file handle, e.g. for an unknown encoding.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000893
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Data written to the wrapper is interpreted according to
        data_encoding and stored in the original file using
        file_encoding.  Data read from the file is interpreted using
        file_encoding and handed back to the caller using
        data_encoding.  The intermediate format depends on the codecs
        involved.

        file_encoding defaults to data_encoding when it is not given.

        errors selects the error handling scheme; the default 'strict'
        causes ValueErrors to be raised on encoding errors.

        The returned object carries the attributes .data_encoding and
        .file_encoding, set to the encodings actually used, so that
        programs can introspect the wrapper.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Record the encodings on the instance to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000929
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000930### Helpers for codec lookup
931
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000941
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
951
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but offers no incremental encoder.
    raise LookupError(encoding)
965
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but offers no incremental decoder.
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000979
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000989
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
999
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator through an
    IncrementalEncoder for the given encoding and yields each
    non-empty piece of output.  After the input is exhausted the
    encoder is flushed with a final call and any remaining output
    is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call: lets the encoder emit whatever it still holds.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1017
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every chunk produced by iterator through an
    IncrementalDecoder for the given encoding and yields each
    non-empty piece of output.  After the input is exhausted the
    decoder is flushed with a final call and any remaining output
    is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call: lets the decoder emit (or reject) what it still holds.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001035
Marc-André Lemburga866df82001-01-03 21:29:14 +00001036### Helpers for charmap-based codecs
1037
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001047
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map
        cannot be inverted unambiguously.  Such a target is mapped
        to None (undefined mapping), which causes an exception when
        the charmap codec encounters it during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # First occurrence wins; every later one invalidates the entry.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001068
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001069### error handlers
1070
# Resolve the predefined error handlers in one go.  If any single
# lookup fails, all five names fall back to None.
try:
    _handlers = [
        lookup_error(_name)
        for _name in ("strict", "ignore", "replace",
                      "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    _handlers = [None] * 5
(strict_errors,
 ignore_errors,
 replace_errors,
 xmlcharrefreplace_errors,
 backslashreplace_errors) = _handlers
del _handlers
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001084
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below never executes because _false is 0; it
# exists only so that the import statement is present in this module.
# NOTE(review): presumably a plain variable is used instead of a
# literal False so the dead branch is not optimized away -- confirm
# before simplifying.
_false = 0
if _false:
    import encodings
1090
Guido van Rossum0612d842000-03-10 23:20:43 +00001091### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001092
if __name__ == '__main__':

    # Demo that runs only when the module is executed as a script.
    # NOTE(review): EncodedFile wraps the stream in a StreamRecoder
    # whose write() decodes its argument and whose writer then writes
    # to the wrapped stream; presumably this needs binary streams
    # (e.g. sys.stdout.buffer / sys.stdin.buffer) -- confirm.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')