blob: 85df89af63d0ac1d62d27b308c97b215fb69ed67 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
Tim Peters30324a72001-05-15 17:19:16 +000019__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000020 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000022 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
Serhiy Storchaka166ebc42014-11-25 13:57:17 +020025 "backslashreplace_errors", "namereplace_errors",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000026 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8 encoded byte order mark
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16 byte order marks, little endian then big endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32 byte order marks, little endian then big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native-endianness aliases: pick the variant matching sys.byteorder
BOM = BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code); note that the "32" names
# refer to the UTF-16 marks and the "64" names to the UTF-32 marks
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class CodecInfo(tuple):
    """Codec details when looking up the codec registry"""

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Class-level default: treat codecs as text encodings

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # The tuple payload is the 4-item (encode, decode, reader, writer)
        # form; everything else is only reachable as an attribute.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode, info.decode = encode, decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter, info.streamreader = streamwriter, streamreader
        # Only shadow the class-level default when the caller was explicit
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at %#x>" % (
            klass.__module__, klass.__qualname__, self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +0000106
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme to use. The following
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state on the Codec
            instance. Codecs that need state to encode/decode
            efficiently should use StreamCodec instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state on the Codec
            instance. Codecs that need state to encode/decode
            efficiently should use StreamCodec instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
171
class IncrementalEncoder(object):
    """
    Encoder that works through its input in several steps: the input is
    handed piece by piece to encode(), and the encoder keeps the state of
    the encoding process from one encode() call to the next.
    """
    def __init__(self, errors='strict'):
        """
        Set up an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module docstring
        for a list of possible values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base version always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state (a no-op here).
        """

    def getstate(self):
        """
        Report the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op
        here).
        """
211
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that have to hold back part of
    their input between calls to encode(). Subclasses supply
    _buffer_encode(); the buffering itself is handled here.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input handed to encode() that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and hand back an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # any false state (such as 0) stands for an empty buffer
        self.buffer = state if state else ""
245
class IncrementalDecoder(object):
    """
    Decoder that works through its input in several steps: the input is
    handed piece by piece to decode(), and the decoder keeps the state of
    the decoding process from one decode() call to the next.
    """
    def __init__(self, errors='strict'):
        """
        Set up an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base version always raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state (a no-op here).
        """

    def getstate(self):
        """
        Report the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple. buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is what this base
        version always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate() (a no-op
        here).

        The effect of setstate((b"", 0)) must be equivalent to reset().
        """
294
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with incomplete
    byte sequences. Subclasses supply _buffer_decode(); the buffering
    itself is handled here.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes handed to decode() that have not been decoded yet
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and hand back an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # there is no extra state besides the buffer, hence the constant 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered bytes matter; the second item is discarded
        self.buffer = state[0]
330
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000337
class StreamWriter(Codec):

    """ Encodes objects with self.encode() and writes the result to
        an underlying stream.

        Attributes that are not found on the writer itself are
        looked up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using
            self.errors; the consumed length reported by the encoder
            is not used.
        """
        data, _ = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the writer has no stream
            attribute to delegate to.
        """
        # __getattr__ only runs once normal lookup has failed. If
        # "stream" itself is what is missing (an instance created
        # without __init__, e.g. via __new__), reading self.stream
        # below would re-enter this method without end and surface as
        # a RecursionError; report the attribute as missing instead.
        if name == "stream":
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
408
Guido van Rossum0612d842000-03-10 23:20:43 +0000409###
410
class StreamReader(Codec):

    """ Reads bytes from an underlying stream and decodes them with
        self.decode().

        Three buffers are kept between calls: bytebuffer (bytes read
        but not yet decoded), charbuffer (decoded data not yet handed
        out) and linebuffer (a list of already split lines, or None).
        Attributes that are not found on the reader itself are looked
        up on the wrapped stream.
    """

    # Type of the decoded data; calling it with no arguments must give
    # the empty value used to initialise and reset the character buffer
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by subclasses; read() expects a
        # (decoded data, number of bytes consumed) tuple back.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, the data in
            front of the error is decoded and used instead of raising;
            the bytes from the error position onwards are kept until
            the next call to read(). If the data in front of the error
            does not split into more than one line the error is
            re-raised.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes in front of the error
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once for the
            line data. Without size, reading starts with 72 bytes
            and the amount is doubled on every further attempt until
            it reaches 8000 or more.

            If keepends is false the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the reader has no stream
            attribute to delegate to.
        """
        # __getattr__ only runs once normal lookup has failed. If
        # "stream" itself is what is missing (an instance created
        # without __init__, e.g. via __new__), reading self.stream
        # below would re-enter this method without end and surface as
        # a RecursionError; report the attribute as missing instead.
        if name == "stream":
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
656
Guido van Rossum0612d842000-03-10 23:20:43 +0000657###
658
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        Reading is delegated to a reader object and writing to a writer
        object, both created over the same stream.  The design is such
        that one can use the factory functions returned by the
        codecs.lookup() function to construct the instance.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: everything is forwarded to self.reader

    def read(self, size=-1):
        """ Return the result of self.reader.read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Return the result of self.reader.readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Return the result of self.reader.readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        """ Return self; lines are produced by __next__(). """
        return self

    # --- writing: everything is forwarded to self.writer

    def write(self, data):
        """ Return the result of self.writer.write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Return the result of self.writer.writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        """ Reset both the reader and the writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Move the stream position and reset codec state.

            The reader is reset after every seek; the writer is reset
            only when seeking to the absolute start of the stream
            (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        rewound = (whence == 0 and offset == 0)
        if rewound:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Python calls this only when normal attribute lookup fails.
            Raises AttributeError if self.stream itself is not set.
        """
        if name == 'stream':
            # No 'stream' attribute on the instance (e.g. it was created
            # without running __init__).  Evaluating self.stream below
            # would call __getattr__ again and recurse until
            # RecursionError.
            raise AttributeError(name)
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Closing is unconditional; the exception details are not used.
        self.stream.close()
743
Guido van Rossum0612d842000-03-10 23:20:43 +0000744###
745
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller after being
        passed through encode.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def _to_frontend(self, data):
        # encode() returns (output, length consumed); only the output
        # is handed back to callers.
        output, _consumed = self.encode(data, self.errors)
        return output

    def read(self, size=-1):
        """ Read via the Reader and return the data run through encode. """
        return self._to_frontend(self.reader.read(size))

    def readline(self, size=None):
        """ Read one line via the Reader and return it run through encode.

            size is forwarded to the Reader only when it is not None.
        """
        if size is None:
            line = self.reader.readline()
        else:
            line = self.reader.readline(size)
        return self._to_frontend(line)

    def readlines(self, sizehint=None):
        """ Read everything via the Reader, run it through encode and
            return it split into lines with line ends kept.

            sizehint is accepted but not used.
        """
        data = self._to_frontend(self.reader.read())
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the Reader, run through encode."""
        return self._to_frontend(next(self.reader))

    def __iter__(self):
        return self

    def write(self, data):
        """ Run data through decode and write the result via the Writer. """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):
        """ Join the given lines, run them through decode and write the
            result via the Writer.

            The lines are joined as bytes; if that fails because they are
            str objects (frontend codecs that work on text), they are
            joined as str instead.
        """
        # Materialise first so a one-shot iterable survives the retry.
        lines = tuple(list)
        try:
            data = b''.join(lines)
        except TypeError:
            data = ''.join(lines)
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def reset(self):
        """ Reset both the Reader and the Writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Move the stream position and reset codec state.

            Without this method a seek would be forwarded straight to
            the stream by __getattr__ and the Reader would go on
            returning data buffered before the seek.  The Reader is
            reset after every seek; the Writer only when seeking to the
            absolute start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Python calls this only when normal attribute lookup fails.
            Raises AttributeError if self.stream itself is not set.
        """
        if name == 'stream':
            # No 'stream' attribute on the instance (e.g. it was created
            # without running __init__).  Evaluating self.stream below
            # would call __getattr__ again and recurse until
            # RecursionError.
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Closing is unconditional; the exception details are not used.
        self.stream.close()
859
Guido van Rossum0612d842000-03-10 23:20:43 +0000860### Shortcuts
861
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object from the builtin
        open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed to the builtin open() API. It defaults
        to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises LookupError if the encoding is unknown; the file that
        was opened is closed again in that case.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file when the codec lookup or the
        # wrapper construction fails.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000905
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is interpreted according to
        the given data_encoding and then written to the original file
        using file_encoding. The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        Data is read from the file using file_encoding and then passed
        back to the caller using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000941
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000942### Helpers for codec lookup
943
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000953
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
963
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder.
    raise LookupError(encoding)
977
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder.
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000991
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001001
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1011
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.
    After the input is exhausted the encoder is flushed with a final
    empty string, and that output is yielded too if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    tail = encoder.encode("", True)
    if tail:
        yield tail
1029
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input byte strings from the iterator using an
    IncrementalDecoder and yields every non-empty piece of output.
    After the input is exhausted the decoder is flushed with a final
    empty byte string, and that output is yielded too if it is
    non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            continue
        yield decoded
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001047
Marc-André Lemburga866df82001-01-03 21:29:14 +00001048### Helpers for charmap-based codecs
1049
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001059
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Every (key, value) pair of decoding_map becomes value -> key in
        the result.  If a target mapping in the decoding map occurs
        multiple times, then that target is mapped to None (undefined
        mapping), causing an exception when encountered by the charmap
        codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and becomes undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001080
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001081### error handlers
1082
# Module-level shortcuts for the standard error handlers.  All six are
# looked up together; if any lookup raises LookupError, every shortcut
# is set to None instead.
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors,
     namereplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace",
                             "namereplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
    namereplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001098
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import is guarded by a name that is false at run time,
# so it is never executed; it is written out only so that the import
# statement is present in the module.
_false = 0
if _false:
    import encodings
1104
Guido van Rossum0612d842000-03-10 23:20:43 +00001105### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001106
Guido van Rossum0612d842000-03-10 23:20:43 +00001107if __name__ == '__main__':
1108
Guido van Rossuma3277132000-04-11 15:37:43 +00001109 # Make stdout translate Latin-1 output into UTF-8 output
1110 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001111
Guido van Rossuma3277132000-04-11 15:37:43 +00001112 # Have stdin translate Latin-1 input into UTF-8 input
1113 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')