blob: ec7879fb6be84a7c33fa3788fb4281b57c5bfc4c [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
Tim Peters30324a72001-05-15 17:19:16 +000019__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000020 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000022 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
Guido van Rossum0612d842000-03-10 23:20:43 +000027### Constants
28
29#
Walter Dörwald474458d2002-06-04 15:16:29 +000030# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000033#
Guido van Rossum0612d842000-03-10 23:20:43 +000034
# Canonical byte order marks: the encoded forms of U+FEFF for each
# Unicode transformation format and byte order.
BOM_UTF8 = b'\xef\xbb\xbf'
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Marks matching the byte order of the machine we are running on
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in
# their names they refer to the UTF-16 and UTF-32 marks respectively.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundles the entry points of one codec.

        The object is a 4-tuple (encode, decode, streamreader,
        streamwriter). The same four entries, the incremental
        encoder/decoder factories and the codec name are also
        available as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries are part of the tuple value
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.encode, info.decode = encode, decode
        info.streamreader, info.streamwriter = streamreader, streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
Thomas Woutersa9773292006-04-21 09:43:23 +000093
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; return a tuple (output object,
            length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not store state in the Codec
            instance. Codecs that have to keep state in order to
            encode efficiently should use StreamWriter instead.

            Zero length input must be supported; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; return a tuple (output object,
            length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not store state in the Codec
            instance. Codecs that have to keep state in order to
            decode efficiently should use StreamReader instead.

            Zero length input must be supported; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
157
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the Codec class docstring for the predefined
        values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        The base implementation ignores state.
        """
        return None
197
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always
    consume all of their input at once: whatever _buffer_encode()
    reports as not consumed is kept in self.buffer and prepended to
    the input of the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet; carried over to the
        # next encode() call
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # any false state (such as the 0 produced by getstate()) means
        # "nothing buffered"
        self.buffer = state if state else ""
231
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the Codec class docstring for the predefined
        values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple. buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer that
        describes the decoder state WITHOUT the contents of
        buffered_input having been processed. In the initial state and
        after reset(), getstate() must return (b"", 0), which is what
        this base implementation always returns.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().

        The base implementation ignores state.
        """
        return None
280
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() reports as
    not consumed is kept in self.buffer and prepended to the input of
    the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet; carried over to the
        # next decode() call
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # there is no state besides the buffer, so the additional
        # state info is always 0
        leftover = self.buffer
        return (leftover, 0)

    def setstate(self, state):
        # only the buffered input is restored; the additional state
        # info (second item) is ignored
        self.buffer = state[0]
316
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000323
class StreamWriter(Codec):

    """ Codec subclass that encodes objects and writes the result to
        an underlying stream.

        Attributes that are not defined here are looked up on the
        wrapped stream. The instance can be used as a context manager;
        leaving the with block closes the stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.

            Creating an instance emits a DeprecationWarning that
            points to io.TextIOWrapper.
        """
        import warnings
        # stacklevel=2 attributes the warning to the code that creates
        # the writer rather than to this constructor
        warnings.warn('use io.TextIOWrapper', DeprecationWarning, stacklevel=2)
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using
            self.errors; the consumed length reported by the encoder
            is not used.
        """
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation keeps no state and does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the position of the underlying stream.

            The codec state is reset only when seeking to the very
            beginning of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the instance has no stream yet.
        """
        # __getattr__ only runs after normal lookup failed, so being
        # asked for "stream" means it was never set (e.g. the instance
        # was created without __init__). Delegating would look up
        # self.stream again and recurse without bound.
        if name == "stream":
            raise AttributeError("%r object has no attribute %r"
                                 % (type(self).__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # always close the stream; exceptions are not suppressed
        self.stream.close()
396
Guido van Rossum0612d842000-03-10 23:20:43 +0000397###
398
class StreamReader(Codec):

    """ Codec subclass that reads bytes from an underlying stream and
        returns them decoded.

        Undecoded bytes, decoded-but-unreturned characters and lines
        cached by readline() are buffered on the instance between
        calls. Attributes that are not defined here are looked up on
        the wrapped stream. The instance is its own line iterator and
        can be used as a context manager; leaving the with block
        closes the stream.

    """

    # Type of the decoded data; an instance created without arguments
    # serves as the empty character buffer.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.

            Creating an instance emits a DeprecationWarning that
            points to io.TextIOWrapper.
        """
        import warnings
        # stacklevel=2 attributes the warning to the code that creates
        # the reader rather than to this constructor
        warnings.warn('use io.TextIOWrapper', DeprecationWarning, stacklevel=2)
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been returned yet; set to None
        # while readline() keeps the pending data in linebuffer
        self.charbuffer = self._empty_charbuffer
        # list of pending lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by the concrete codec
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with the part before the offending bytes;
                    # the error is only tolerated if that part holds
                    # more than one line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once.

            If keepends is false the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # start with a small read and double it (see the end of the
        # loop) as long as no complete line has been found
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the instance has no stream yet.
        """
        # __getattr__ only runs after normal lookup failed, so being
        # asked for "stream" means it was never set (e.g. the instance
        # was created without __init__). Delegating would look up
        # self.stream again and recurse without bound.
        if name == "stream":
            raise AttributeError("%r object has no attribute %r"
                                 % (type(self).__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # always close the stream; exceptions are not suppressed
        self.stream.close()
647
Guido van Rossum0612d842000-03-10 23:20:43 +0000648###
649
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Read requests are forwarded to self.reader, write requests to
        self.writer; every other attribute is looked up on the wrapped
        stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of self.reader.read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of self.reader.readline(size)."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of self.reader.readlines(sizehint)."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Return the result of self.writer.write(data)."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Return the result of self.writer.writelines(list)."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the position of the underlying stream.

            The reader is always reset; the writer is reset only when
            seeking to the very beginning of the stream (offset 0
            with whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the instance has no stream yet.
        """
        # __getattr__ only runs after normal lookup failed, so being
        # asked for "stream" means it was never set (e.g. the instance
        # was created without __init__). Delegating would look up
        # self.stream again and recurse without bound.
        if name == "stream":
            raise AttributeError("%r object has no attribute %r"
                                 % (type(self).__name__, name))
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # always close the stream; exceptions are not suppressed
        self.stream.close()
734
Guido van Rossum0612d842000-03-10 23:20:43 +0000735###
736
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        Data written to the recoder is first passed through the
        frontend decode function and the result is written to the
        stream using the backend Writer instance.

        In the other direction, data is read from the stream using
        the backend Reader instance, passed through the frontend
        encode function and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            stream must be a file-like object.

            encode and decode are called as func(data, errors) and
            must return a (result, length) tuple.

            Reader and Writer must be factory functions or classes
            that are called as Reader(stream, errors) and
            Writer(stream, errors).

            errors names the error handling scheme; it is passed to
            the Reader, the Writer and to every encode/decode call.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def _to_frontend(self, data):
        # Convert data coming from the backend reader into the
        # frontend representation; the consumed length is not needed.
        converted, _ = self.encode(data, self.errors)
        return converted

    def _to_backend(self, data):
        # Convert frontend data into the intermediate representation
        # expected by the backend writer.
        converted, _ = self.decode(data, self.errors)
        return converted

    def read(self, size=-1):

        """ Read via the backend reader and return the data
            re-encoded for the frontend.
        """
        return self._to_frontend(self.reader.read(size))

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded for the frontend.

            size is only passed on to the reader if it is not None.

        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        return self._to_frontend(data)

    def readlines(self, sizehint=None):

        """ Read everything the backend reader delivers, re-encode it
            and return it as a list of lines with line ends kept.

            sizehint is accepted for interface compatibility, but is
            ignored.

        """
        data = self._to_frontend(self.reader.read())
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            for the frontend.
        """
        return self._to_frontend(next(self.reader))

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result using the backend writer.
        """
        return self.writer.write(self._to_backend(data))

    def writelines(self, list):

        """ Concatenate all items of list and write them like a
            single .write() call would.

            The items are usually bytes objects; str items are still
            joined correctly for codecs whose decode function accepts
            str input.

        """
        chunks = tuple(list)
        # Joining bytes with a str separator raises a TypeError, so the
        # separator has to match the type of the chunks. An empty
        # sequence is treated as empty bytes.
        joiner = '' if chunks and isinstance(chunks[0], str) else b''
        return self.writer.write(self._to_backend(joiner.join(chunks)))

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream.

            Without this method the call would fall through to the
            stream via __getattr__() and leave stale data in the
            reader's buffers. The reader is therefore always reset;
            the writer is only reset when seeking to the very start.

        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with EncodedFile(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
850
Guido van Rossum0612d842000-03-10 23:20:43 +0000851### Shortcuts
852
def open(filename, mode='r', encoding=None, errors=None, buffering=1):

    """ Open a file using the given mode and return a file object
        providing transparent encoding/decoding.

        If encoding is given, the file is opened with builtins.open()
        in text mode using that encoding, and str objects are read
        and written. A 'b' in mode is accepted and ignored in this
        case. Newline translation is disabled (newline=''), so line
        ends reach the codec unchanged.

        If encoding is None, the file is opened in binary mode, even
        if no binary mode was specified, and bytes objects are read
        and written. errors is not used in this case.

        The default file mode is 'r'.

        errors may be given to define the error handling. The default
        None leaves the choice to builtins.open().

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered. Line buffering only applies to
        text mode; for binary files the default buffer size is used
        instead.

        The file object returned for a given encoding provides an
        .encoding attribute which allows querying the used encoding.

    """
    if encoding is not None:
        # builtins.open() refuses an encoding in binary mode, so drop
        # the flag in order to keep calls such as
        # open(name, 'rb', 'utf-8') working.
        text_mode = mode.replace('b', '')
        return builtins.open(filename, text_mode, buffering,
                             encoding, errors, newline='')
    if 'b' not in mode:
        mode = mode + 'b'
    if buffering == 1:
        # Line buffering is not available for binary files; ask for
        # the default buffer size explicitly to avoid a warning.
        buffering = -1
    # Binary mode accepts neither an encoding nor an errors argument.
    return builtins.open(filename, mode, buffering)
Guido van Rossum0612d842000-03-10 23:20:43 +0000891
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings.

        Data written to the wrapper is decoded with the codec for
        data_encoding and written to file through the stream writer
        of the codec for file_encoding.

        Data read from the wrapper comes from file through the stream
        reader of the codec for file_encoding and is handed back
        encoded with the codec for data_encoding.

        file_encoding defaults to data_encoding if it is None.

        errors is passed on to the StreamRecoder and defaults to
        'strict'.

        The returned object carries the attributes .data_encoding and
        .file_encoding which hold the encoding names that were used,
        so that Python programs can introspect the wrapper.

    """
    backend_name = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_name)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Record the encoding names for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_name
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000927
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000928### Helpers for codec lookup
929
def getencoder(encoding):

    """ Return the encoder function of the codec registered for
        the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000939
def getdecoder(encoding):

    """ Return the decoder function of the codec registered for
        the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
949
def getincrementalencoder(encoding):

    """ Return the IncrementalEncoder class or factory function of
        the codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
963
def getincrementaldecoder(encoding):

    """ Return the IncrementalDecoder class or factory function of
        the codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000977
def getreader(encoding):

    """ Return the StreamReader class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000987
def getwriter(encoding):

    """ Return the StreamWriter class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
997
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from the iterator to an IncrementalEncoder for
    the given encoding and yields each non-empty result. Once the
    iterator is exhausted the encoder is flushed, and the flushed data
    is yielded as well unless it is empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # Final call: flush whatever the encoder still holds
    tail = encoder.encode("", True)
    if tail:
        yield tail
1015
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every item from the iterator to an IncrementalDecoder for
    the given encoding and yields each non-empty result. Once the
    iterator is exhausted the decoder is flushed, and the flushed data
    is yielded as well unless it is empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    # Final call: flush whatever the decoder still holds
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001033
Marc-André Lemburga866df82001-01-03 21:29:14 +00001034### Helpers for charmap-based codecs
1035
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both a key and the value stored under that key.

    """
    return {item: item for item in rng}
1048
def make_encoding_map(decoding_map):

    """ Invert a decoding map to obtain an encoding map.

        Every value of decoding_map becomes a key of the result that
        maps back to its original key. A value which occurs more
        than once in decoding_map cannot be inverted unambiguously
        and is mapped to None (undefined mapping) instead, which
        causes an exception when the charmap codec encounters it
        during translation.

        The cp875 codec, which decodes several characters to U+001A,
        is one example where this happens.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target that was seen before is ambiguous
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001070### error handlers
1071
try:
    # The right-hand side is evaluated completely before any name is
    # bound, so a failing lookup leaves it to the except clause to
    # define all five names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = (lookup_error("strict"),
                                 lookup_error("ignore"),
                                 lookup_error("replace"),
                                 lookup_error("xmlcharrefreplace"),
                                 lookup_error("backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001085
# Hint for modulefinder: programs which use codecs most likely need
# the encodings package as well. The import below is never executed,
# because the flag guarding it is false.
_false = 0
if _false:
    import encodings
1091
Guido van Rossum0612d842000-03-10 23:20:43 +00001092### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001093
if __name__ == '__main__':

    # Data written to stdout is taken to be Latin-1 and is passed on
    # as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data read from stdin is taken to be Latin-1 and is handed to
    # the program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')