blob: 2e2e7555a4829724fb6e72de10a210048de10781 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
Tim Peters30324a72001-05-15 17:19:16 +000019__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000020 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000022 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) in each of
# its encoded byte string forms: UTF-8, and UTF-16/UTF-32 in both byte
# orders.  The "native" aliases are chosen from sys.byteorder.
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16

# UTF-32, native endianness
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): the "32"/"64" names actually
# refer to the UTF-16/UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    The object is the 4-tuple (encode, decode, streamreader,
    streamwriter); the same entries, the incremental encoder/decoder
    factories and the codec name are also available as attributes.
    """

    # Private API to allow Python 3.4 to blacklist the known non-Unicode
    # codecs in the standard library. A more general mechanism to
    # reliably distinguish text encodings from other codecs will hopefully
    # be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Assume codecs are text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        # Only the four classic entries take part in the tuple itself.
        info = tuple.__new__(
            cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode, info.decode = encode, decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Leave the class-level default in place unless told otherwise.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +0000105
class Codec:

    """ Interface definition for stateless encoders/decoders.

        The errors argument of the .encode()/.decode() methods selects
        the error handling scheme. These string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private codepoints U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec instance.
            Use StreamWriter for codecs which have to keep state in
            order to make encoding efficient.

            Zero length input must be handled; in that case an empty
            object of the output object type is returned.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot, for example bytes objects, buffer objects and
            memory mapped files.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec instance.
            Use StreamReader for codecs which have to keep state in
            order to make decoding efficient.

            Zero length input must be handled; in that case an empty
            object of the output object type is returned.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
170
class IncrementalEncoder(object):
    """
    Base class for encoders that work in multiple steps: input is handed
    piece by piece to encode(), and the encoder remembers the state of
    the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the module docstring for a list of possible values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (no-op in the base class).
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate(). The base class ignores it.
        """
210
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder variant for encoders that cannot always consume
    all of their input at once: whatever _buffer_encode() leaves
    unconsumed is kept in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0.
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # Any false state (such as the 0 from getstate()) clears the buffer.
        self.buffer = state if state else ""
244
class IncrementalDecoder(object):
    """
    Base class for decoders that work in multiple steps: input is handed
    piece by piece to decode(), and the decoder remembers the state of
    the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the module docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base version raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (no-op in the base class).
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple. buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not yet been converted.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is what the base class
        always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset(). The base
        class ignores the argument.
        """
293
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder variant for decoders that must be able to handle
    incomplete byte sequences: bytes that _buffer_decode() does not
    consume are kept in self.buffer and prepended to the next input.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # The buffer is the whole state; the additional info is always 0.
        leftover = self.buffer
        return (leftover, 0)

    def setstate(self, state):
        # Only the buffered input is restored; additional info is ignored.
        leftover = state[0]
        self.buffer = leftover
329
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000336
class StreamWriter(Codec):

    """ Wraps a writable stream: objects passed to .write() are run
        through self.encode() and the result is written to the stream.

        Attributes not found on the writer itself are looked up on the
        underlying stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # encode() returns (output, length consumed); only the output
        # is needed here.
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Repositions the underlying stream.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute at all.
        """
        # __getattr__ only runs after normal lookup failed, so arriving
        # here for 'stream' means the instance has none (e.g. it was
        # created without running __init__).  Reading self.stream below
        # would then re-enter this method and recurse without bound.
        if name == "stream":
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()
407
Guido van Rossum0612d842000-03-10 23:20:43 +0000408###
409
class StreamReader(Codec):

    """ Wraps a readable stream: bytes read from the stream are run
        through self.decode() and handed out by .read(), .readline(),
        .readlines() or by iterating over the reader.

        Attributes not found on the reader itself are looked up on the
        underlying stream.
    """

    # Type of the decoded data; an empty instance of it is used as the
    # empty character buffer.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream but not yet decoded
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data not yet handed out to the caller
        self.charbuffer = self._empty_charbuffer
        # list of already split lines cached by readline(), or None;
        # while it is in use, charbuffer is None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by the concrete codec.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the bytes before the error; accept
                    # the result if it contains more than one line,
                    # otherwise propagate the error.
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            # grow the read size for the next attempt (stops doubling
            # once it has reached 8000)
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute at all.
        """
        # __getattr__ only runs after normal lookup failed, so arriving
        # here for 'stream' means the instance has none (e.g. it was
        # created without running __init__).  Reading self.stream below
        # would then re-enter this method and recurse without bound.
        if name == "stream":
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the with-block closes the underlying stream.
        self.stream.close()
656
Guido van Rossum0612d842000-03-10 23:20:43 +0000657###
658
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and
        write modes.

        Reading is delegated to a reader object and writing to a
        writer object, both created over the same stream.  The
        factory functions returned by the codec.lookup() function
        can be used to construct the instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # --- reading: everything is forwarded to self.reader ---

    def read(self, size=-1):
        """ Forward to the reader's read() and return its result."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Forward to the reader's readline() and return its result."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Forward to the reader's readlines() and return its result."""
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return next(reader)

    def __iter__(self):
        """ The wrapper is its own iterator."""
        return self

    # --- writing: everything is forwarded to self.writer ---

    def write(self, data):
        """ Forward to the writer's write() and return its result."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Forward to the writer's writelines() and return its result."""
        writer = self.writer
        return writer.writelines(list)

    # --- state handling ---

    def reset(self):
        """ Reset the reader first, then the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream.

            The reader is always reset afterwards; the writer is only
            reset when seeking to the absolute start of the stream
            (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        back_at_start = (whence == 0 and offset == 0)
        if back_at_start:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        wrapped = self.stream
        return getattr(wrapped, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        """ Enter a with-block; the wrapper is the context value."""
        return self

    def __exit__(self, type, value, tb):
        """ Leave a with-block by closing the underlying stream."""
        self.stream.close()
743
Guido van Rossum0612d842000-03-10 23:20:43 +0000744###
745
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the result
            re-encoded with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encode function.

            size is only passed on to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader delivers, re-encode
            it and return it split into lines (line ends are kept).

            sizehint is accepted but not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        """ The recoder is its own iterator."""
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, decode them with the
            frontend decode function and write the result through
            the backend writer.
        """
        # The parameter shadows the builtin list(), hence tuple().
        chunks = tuple(list)
        # Join with an empty value of the chunks' own type: joining
        # with '' fails with a TypeError as soon as the chunks are
        # bytes, which is what write() accepts for text encodings.
        empty = chunks[0][:0] if chunks else b''
        data = empty.join(chunks)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the reader first, then the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop codec state.

            Without this method a seek() call would be forwarded by
            __getattr__ straight to the stream, leaving data that the
            reader has already buffered in place.  The handling is
            the same as in StreamReaderWriter.seek(): the reader is
            always reset, the writer only when seeking to the
            absolute start of the stream.
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        """ Enter a with-block; the recoder is the context value."""
        return self

    def __exit__(self, type, value, tb):
        """ Leave a with-block by closing the underlying stream."""
        self.stream.close()
859
Guido van Rossum0612d842000-03-10 23:20:43 +0000860### Shortcuts
861
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed on to the builtin open() API and has the
        same meaning as there. It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or creating the wrapper fails, the
        file that was just opened is closed again before the
        exception is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
    except BaseException:
        # Do not leak the open file when the codec is unknown or the
        # wrapper cannot be built.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000905
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is interpreted according
        to the given data_encoding and then written to the original
        file using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Data is read from the file using file_encoding and then
        passed back to the caller using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name (file_encoding after the default
        has been applied). The attributes can be used for
        introspection by Python programs.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(file, frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000941
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000942### Helpers for codec lookup
943
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000953
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
963
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
977
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000991
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001001
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
1011
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.
    After the input is exhausted the encoder is called once more
    with an empty string and the final flag set, and any output
    from that call is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call so the encoder can emit whatever it still holds.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1029
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input from the iterator using an IncrementalDecoder
    and yields every non-empty piece of output.  After the input is
    exhausted the decoder is called once more with empty bytes and
    the final flag set, and any output from that call is yielded as
    well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call so the decoder can deal with input it still holds.
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001047
Marc-André Lemburga866df82001-01-03 21:29:14 +00001048### Helpers for charmap-based codecs
1049
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of rng is both
        key and value, i.e. is mapped to itself.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001059
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Keys and values of the decoding map swap roles.  If a target
        mapping in the decoding map occurs multiple times, then that
        target is mapped to None (undefined mapping), causing an
        exception when encountered by the charmap codec during
        translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and becomes undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001080
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001081### error handlers
1082
try:
    # Resolve all handlers first; the names are only bound when
    # every single lookup has succeeded.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001096
# Tell modulefinder that using codecs probably needs the encodings
# package.  _false is 0, so the import below is never executed when
# this module runs; the statement is only there to be found.
# NOTE(review): presumably a name is tested rather than a literal so
# that the import statement is kept in the compiled code -- confirm
# before simplifying this.
_false = 0
if _false:
    import encodings
1102
Guido van Rossum0612d842000-03-10 23:20:43 +00001103### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001104
if __name__ == '__main__':

    # Only done when the module is run as a script: both standard
    # streams are replaced by EncodedFile wrappers.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')