blob: 993451711af3d00ff98c90187c0740c5a3706d34 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
Tim Peters30324a72001-05-15 17:19:16 +000019__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000020 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000022 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and the byte strings it is written as in UTF-8/UTF-16/UTF-32
# output, for both little and big endian byte order.
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian (BOM_LE is an alias for the same bytes object)
BOM_UTF16_LE = b'\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian (BOM_BE is an alias for the same bytes object)
BOM_UTF16_BE = b'\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# UTF-16, native endianness (chosen from sys.byteorder)
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16

# UTF-32, native endianness (chosen from sys.byteorder)
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in
# the names they refer to the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):
    """Codec details when looking up the codec registry.

    An instance is the 4-tuple (encode, decode, streamreader,
    streamwriter); every constructor argument is additionally stored
    as an instance attribute of the same name.
    """

    # Private API that lets Python 3.4 exclude the known non-Unicode
    # codecs in the standard library.  A more general mechanism to
    # reliably distinguish text encodings from other codecs will
    # hopefully be defined for Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # Codecs are assumed to be text encodings

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                *, _is_text_encoding=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        attributes = (
            ("name", name),
            ("encode", encode),
            ("decode", decode),
            ("incrementalencoder", incrementalencoder),
            ("incrementaldecoder", incrementaldecoder),
            ("streamwriter", streamwriter),
            ("streamreader", streamreader),
        )
        for attr_name, attr_value in attributes:
            setattr(info, attr_name, attr_value)
        # Only shadow the class-level default when the caller was explicit.
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at %#x>" % (
            klass.__module__, klass.__qualname__, self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +0000105
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'surrogateescape' - replace with private code points U+DCnn.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode efficiently should be
            written as a StreamWriter instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot, i.e. one that exposes its data as a readable
            buffer (memory mapped files are one example).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to decode efficiently should be
            written as a StreamReader instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
170
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input in several steps.
    The input is handed to encode() piece by piece and the encoder keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the Codec class docstring for the predefined
        values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state. Does nothing here.
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate(). Does nothing here.
        """
210
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder subclass intended as the base class for
    incremental encoders that have to carry unprocessed input over
    from one encode() call to the next.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever the hook did not consume waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        return self.buffer if self.buffer else 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) empties the buffer
        self.buffer = state if state else ""
244
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input in several steps.
    The input is handed to decode() piece by piece and the decoder keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance. See the Codec class docstring for the predefined
        values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state. Does nothing here.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is what this base
        implementation always returns.
        """
        return b"", 0

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset(). Does nothing
        here.
        """
293
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder subclass intended as the base class for
    incremental decoders that must cope with incomplete byte sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever the hook did not consume waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the buffer is the only state; the additional info is always 0
        return self.buffer, 0

    def setstate(self, state):
        # only the buffered input is restored; the additional info is ignored
        self.buffer = state[0]
329
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000336
class StreamWriter(Codec):

    """ Codec that encodes objects and writes the result to a stream.

        write() relies on self.encode(), which has to be supplied by
        a subclass. Attributes that are not found on the writer are
        looked up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # encode() returns (output, length consumed); only the output
        # is needed here.
        data, _consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the output stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the writer has no stream yet.
        """
        # __getattr__ only runs after normal lookup has failed. If the
        # missing name is 'stream' itself (instance created without
        # running __init__), reading self.stream below would re-enter
        # this method without end, so report the attribute as missing.
        if name == 'stream':
            raise AttributeError(
                "%r object has no attribute %r" % (type(self).__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()
407
Guido van Rossum0612d842000-03-10 23:20:43 +0000408###
409
class StreamReader(Codec):

    """ Codec that reads bytes from a stream and returns them decoded.

        read() relies on self.decode(), which has to be supplied by
        a subclass. Attributes that are not found on the reader are
        looked up on the wrapped stream.
    """

    # Type of the decoded data; an empty instance of it is used as the
    # initial (and reset) value of the character buffer.
    charbuffertype = str

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        self._empty_charbuffer = self.charbuffertype()
        # decoded data that has not been returned to the caller yet
        self.charbuffer = self._empty_charbuffer
        # list of already split lines cached by readline(), or None;
        # while it is in use, charbuffer is set to None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be overridden; read() expects a tuple
        # (decoded object, number of bytes consumed).
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = self._empty_charbuffer.join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            if not data:
                break
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes in front of the error;
                    # give up unless that yields more than one line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(keepends=True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = self._empty_charbuffer
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(keepends=False)[0]
            return line

        # without an explicit size, start with 72 and double it (see the
        # end of the loop) for every further attempt
        readsize = size or 72
        line = self._empty_charbuffer
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if (isinstance(data, str) and data.endswith("\r")) or \
                   (isinstance(data, bytes) and data.endswith(b"\r")):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(keepends=True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(keepends=False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(keepends=False)[0]
                if line0withend != line0withoutend:  # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = self._empty_charbuffer.join(lines[1:]) + \
                                      self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(keepends=False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = self._empty_charbuffer
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the reader has no stream yet.
        """
        # __getattr__ only runs after normal lookup has failed. If the
        # missing name is 'stream' itself (instance created without
        # running __init__), reading self.stream below would re-enter
        # this method without end, so report the attribute as missing.
        if name == 'stream':
            raise AttributeError(
                "%r object has no attribute %r" % (type(self).__name__, name))
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()
655
Guido van Rossum0612d842000-03-10 23:20:43 +0000656###
657
class StreamReaderWriter:

    """ Wrapper for streams that are used in both read and write mode.

        Reading is delegated to a reader object and writing to a writer
        object, both created over the same stream.  The factory
        functions returned by codecs.lookup() can be used to construct
        the instance.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):
        """ Create a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.  Each is called as factory(stream, errors).

            errors is handed to both factories and is also kept on the
            instance as .errors.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Return the result of self.reader.read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Return the result of self.reader.readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Return the result of self.reader.readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def __next__(self):
        """ Return the next decoded line from the input stream. """
        reader = self.reader
        return next(reader)

    def __iter__(self):
        """ Return self; lines are produced by __next__(). """
        return self

    def write(self, data):
        """ Return the result of self.writer.write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Return the result of self.writer.writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        """ Reset the reader first, then the writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream.

            The reader is reset on every seek.  The writer is reset
            only when seeking to the absolute start of the stream
            (offset 0 with whence 0).
        """
        self.stream.seek(offset, whence)
        self.reader.reset()
        if not (whence == 0 and offset == 0):
            return
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):
        """ Inherit all other methods from the underlying stream. """
        stream = self.stream
        return getattr(stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        """ Enter a with block; the object itself is the context value. """
        return self

    def __exit__(self, type, value, tb):
        """ Leave a with block by closing the underlying stream. """
        stream = self.stream
        stream.close()
742
Guido van Rossum0612d842000-03-10 23:20:43 +0000743###
744
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first passed through the
        frontend decode function to obtain an intermediate format
        (which depends on the given codec combination) and is then
        written to the stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the stream using a
        Reader instance, passed through the frontend encode function
        and returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and passed to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, i.e.
            they are called as func(data, errors) and return a tuple
            whose first item is the converted data.  Reader, Writer
            must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data converted
            with the frontend encode function.
        """
        data = self.reader.read(size)
        return self.encode(data, self.errors)[0]

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            converted with the frontend encode function.

            size is only handed to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        return self.encode(data, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read everything that is left, convert it with the frontend
            encode function and return it split into lines (line ends
            are kept).

            sizehint is accepted for interface compatibility but is
            not used.
        """
        data = self.reader.read()
        data = self.encode(data, self.errors)[0]
        return data.splitlines(keepends=True)

    def __next__(self):

        """ Return the next line from the input stream, converted
            with the frontend encode function.
        """
        data = next(self.reader)
        return self.encode(data, self.errors)[0]

    def __iter__(self):
        return self

    def write(self, data):

        """ Convert data with the frontend decode function and hand
            the result to the backend writer.
        """
        data = self.decode(data, self.errors)[0]
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given chunks, convert them with the
            frontend decode function and hand the result to the
            backend writer.

            The chunks are joined with a separator of matching kind:
            str chunks are joined as str, anything else is joined as
            bytes.  Joining unconditionally with '' raised a TypeError
            for bytes chunks.  An empty sequence is treated as empty
            bytes.
        """
        chunks = tuple(list)
        # Only the first chunk is inspected; mixed sequences fail in
        # join() exactly as they would have before.
        if chunks and isinstance(chunks[0], str):
            data = ''.join(chunks)
        else:
            data = b''.join(chunks)
        data = self.decode(data, self.errors)[0]
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader first, then the backend writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop stale codec state.

            Without this method a seek() call would be forwarded by
            __getattr__ straight to the stream, and data the reader
            had already buffered for the old position would still be
            returned by the next read.

            The reader is reset on every seek; the writer is reset
            only when seeking to the absolute start of the stream.
            The value returned by the stream's seek() is passed back.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
858
Guido van Rossum0612d842000-03-10 23:20:43 +0000859### Shortcuts
860
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified ('b' is appended
        to mode). This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped and mode is used as given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering is passed to the builtin open() API unchanged.
        It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file

    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file when the codec cannot be set up.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000904
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Data written to the wrapped file is interpreted according to
        data_encoding and then written to the original file using
        file_encoding.  Data read from the file is interpreted using
        file_encoding and handed back to the caller using
        data_encoding.  The intermediate format will usually be
        Unicode but depends on the specified codecs.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the encodings
        actually used. They are meant for introspection by Python
        programs.

    """
    backend_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000940
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000941### Helpers for codec lookup
942
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000952
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
962
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
976
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000990
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001000
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1010
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each item of the iterator to an IncrementalEncoder and yields
    every non-empty result.  After the input is exhausted the encoder is
    flushed with a final call and its output, if any, is yielded too.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for encoded in map(encoder.encode, iterator):
        if encoded:
            yield encoded
    # Final call: tell the encoder that no more input follows.
    remainder = encoder.encode("", True)
    if remainder:
        yield remainder
1028
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each item of the iterator to an IncrementalDecoder and yields
    every non-empty result.  After the input is exhausted the decoder is
    flushed with a final call and its output, if any, is yielded too.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for decoded in map(decoder.decode, iterator):
        if decoded:
            yield decoded
    # Final call: tell the decoder that no more input follows.
    remainder = decoder.decode(b"", True)
    if remainder:
        yield remainder
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001046
Marc-André Lemburga866df82001-01-03 21:29:14 +00001047### Helpers for charmap-based codecs
1048
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is both key and value.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
Marc-André Lemburga866df82001-01-03 21:29:14 +00001058
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps every target of the decoding map back to its
        source.  If a target occurs multiple times in the decoding
        map, it is mapped to None (undefined mapping) instead, causing
        an exception when encountered by the charmap codec during
        translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous; mark it as undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001079
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001080### error handlers
1081
# Bind the standard error handlers to module-level names.  Either all
# five lookups succeed, or every name is set to None.
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = (
        lookup_error("strict"),
        lookup_error("ignore"),
        lookup_error("replace"),
        lookup_error("xmlcharrefreplace"),
        lookup_error("backslashreplace"),
    )
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001095
Martin v. Löwis6cd441d2001-07-31 08:54:55 +00001096# Tell modulefinder that using codecs probably needs the encodings
1097# package
1098_false = 0
1099if _false:
1100 import encodings
1101
Guido van Rossum0612d842000-03-10 23:20:43 +00001102### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001103
Guido van Rossum0612d842000-03-10 23:20:43 +00001104if __name__ == '__main__':
1105
Guido van Rossuma3277132000-04-11 15:37:43 +00001106 # Make stdout translate Latin-1 output into UTF-8 output
1107 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001108
Guido van Rossuma3277132000-04-11 15:37:43 +00001109 # Have stdin translate Latin-1 input into UTF-8 input
1110 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')