blob: 982c2825926ce53e7ef78d25a17d49c8a6697b16 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Names exported by "from codecs import *".  The base classes defined in
# this module are listed alongside the registry functions, BOM constants
# and error handlers so that a star-import exposes the whole public API.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
Guido van Rossum0612d842000-03-10 23:20:43 +000027### Constants
28
29#
Walter Dörwald474458d2002-06-04 15:16:29 +000030# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000033#
Guido van Rossum0612d842000-03-10 23:20:43 +000034
Walter Dörwald474458d2002-06-04 15:16:29 +000035# UTF-8
# UTF-8 signature
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16 marks for each byte order, plus their short aliases
BOM_UTF16_LE = b'\xff\xfe'
BOM_LE = BOM_UTF16_LE
BOM_UTF16_BE = b'\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32 marks for each byte order
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Marks in the byte order of the machine running this code
BOM_UTF16 = BOM_UTF16_LE if sys.byteorder == 'little' else BOM_UTF16_BE
BOM = BOM_UTF16
BOM_UTF32 = BOM_UTF32_LE if sys.byteorder == 'little' else BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names these are the UTF-16 and UTF-32 marks respectively.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundles the callables that make up one codec.

        The object is the 4-tuple (encode, decode, streamreader,
        streamwriter).  The same four values, the two incremental
        factories and the codec name are also stored as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        # Attach every piece under its own name as well.
        for attribute, value in (
                ("name", name),
                ("encode", encode),
                ("decode", decode),
                ("incrementalencoder", incrementalencoder),
                ("incrementaldecoder", incrementaldecoder),
                ("streamwriter", streamwriter),
                ("streamreader", streamreader)):
            setattr(info, attribute, value)
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
Thomas Woutersa9773292006-04-21 09:43:23 +000093
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state between calls should use the
            stream or incremental classes instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state between calls should use the
            stream or incremental classes instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
157
class IncrementalEncoder(object):
    """
    Base class for encoders that receive their input in several pieces.

    Each piece is handed to encode(); anything that has to survive from
    one call to the next is kept on the instance.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalEncoder.

        errors names the error handling scheme to use and is stored as
        self.errors.  See the module docstring for a list of possible
        values.
        """
        self.errors, self.buffer = errors, ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        The base class has no encoding logic and raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Report the current state of the encoder; the base class
        always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().  The base
        class ignores the value.
        """
        return None
197
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder variant for encoders that cannot always consume
    everything they are given: input that _buffer_encode() leaves
    unconsumed is held back and prepended to the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input handed to encode() that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # hold back the part the hook did not consume
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        return leftover if leftover else 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) means "empty"
        self.buffer = state if state else ""
231
class IncrementalDecoder(object):
    """
    Base class for decoders that receive their input in several pieces.

    Each piece is handed to decode(); anything that has to survive from
    one call to the next is kept on the instance.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalDecoder.

        errors names the error handling scheme to use and is stored as
        self.errors.  See the module docstring for a list of possible
        values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        The base class has no decoding logic and raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state (a no-op here).
        """
        return None

    def getstate(self):
        """
        Report the current state of the decoder as a
        (buffered_input, additional_state_info) tuple.  The base class
        always reports an empty buffer and 0.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().  The base
        class ignores the value.
        """
        return None
271
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder variant for decoders that have to cope with
    incomplete byte sequences: input that _buffer_decode() leaves
    unconsumed is held back and prepended to the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input handed to decode() that has not been decoded yet
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # hold back the part the hook did not consume
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # the second tuple member (additional state info) is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; the rest of state is ignored
        self.buffer = state[0]
307
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000314
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encodes object with self.encode() and writes the result
            to self.stream.
        """
        # the consumed-length half of the result is not needed here
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Joins the strings in list and passes the result to
            .write() in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state
            that allows appending fresh data without having to rescan
            the whole stream to recover state.  The base class keeps
            no such buffers, so nothing happens here.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Looks up every attribute not found on the writer itself
            on the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
380
Guido van Rossum0612d842000-03-10 23:20:43 +0000381###
382
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = b""
        # decoded characters that have not been handed to the caller yet;
        # set to None while the pending data lives in self.linebuffer
        self.charbuffer = ""
        # list of already split lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be supplied by the concrete codec; read() expects a
        # (decoded object, number of bytes consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    # no limits at all: anything already buffered will do
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    # only a byte limit: it is compared against the
                    # number of buffered characters
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the bytes before the failure ...
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    # ... and give up unless that prefix splits into more
                    # than one line
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once, so the
            result may be an incomplete line.

            If keepends is false the line terminator is removed from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # without an explicit size start with small reads; the size is
        # doubled at the bottom of the loop while it is below 8000
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The decoded data is split with splitlines(); line breaks
            are included in the list entries when keepends is true.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        # the reader is its own iterator; see __next__()
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the wrapped stream
        self.stream.close()
626
Guido van Rossum0612d842000-03-10 23:20:43 +0000627###
628
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Placeholder value; per the original comment, the file wrappers
    # defined later in the module set this attribute on instances.
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Delegates to the reader's read() method."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegates to the reader's readline() method."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the reader's readlines() method."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegates to the writer's write() method."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegates to the writer's writelines() method."""
        return self.writer.writelines(list)

    def reset(self):

        """ Resets both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the underlying stream's current position.

            Without this method seek() would be served by __getattr__
            straight from the raw stream, leaving data the reader has
            already buffered in place, so later reads would return
            content from before the seek.

            Returns whatever the stream's seek() returns.
        """
        position = self.stream.seek(offset, whence)
        # buffered data belongs to the old position; drop it
        self.reader.reset()
        if whence == 0 and offset == 0:
            # back at the very start of the stream: let the writer start
            # from its initial state as well
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
707
Guido van Rossum0612d842000-03-10 23:20:43 +0000708###
709
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller as encoded
        data.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader (passing size through) and
            return the result re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.

            size is only forwarded to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader delivers, re-encode it
            and return it as a list of lines with line ends kept.

            sizehint is accepted for interface compatibility but is
            not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the backend reader, re-encoded
            with the frontend encoder.
        """
        data = next(self.reader)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):

        """ Return the object itself; it acts as its own iterator.
        """
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            through the backend writer, returning the writer's result.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given byte strings, decode the result with
            the frontend decoder and write it through the backend
            writer, returning the writer's result.
        """
        # The frontend data is handed to self.decode and is therefore a
        # sequence of byte strings; joining with a text '' separator
        # fails wherever text and bytes are distinct types.
        data = b''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and then the backend writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):

        """ Enter a "with" block; the object itself is the managed value.
        """
        return self

    def __exit__(self, type, value, tb):

        """ Leave a "with" block by closing the underlying stream.
        """
        self.stream.close()
823
Guido van Rossum0612d842000-03-10 23:20:43 +0000824### Shortcuts
825
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed again before the exception
        is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything: the file must not be leaked
        # whatever went wrong, and the exception is re-raised unchanged.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000869
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates transparently
        between two encodings.

        Strings written to the wrapper are interpreted according to
        data_encoding and stored in the original file using
        file_encoding. Strings read from the file are interpreted
        using file_encoding and handed to the caller in
        data_encoding. The intermediate format will usually be
        Unicode but depends on the specified codecs.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned object carries two extra attributes,
        .data_encoding and .file_encoding, which hold the encodings
        in effect and can be used for introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec first, backend codec second (same lookup order
    # as before, so the same LookupError wins if both are unknown).
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000905
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000906### Helpers for codec lookup
907
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000917
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
927
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
941
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000955
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000965
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
975
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each string from the iterator to an IncrementalEncoder for
    the given encoding and yields every non-empty piece of output,
    followed by whatever the encoder emits on its final call.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    incremental = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = incremental.encode(chunk)
        if encoded:
            yield encoded
    # Final call with an empty string flushes any pending state.
    tail = incremental.encode("", True)
    if tail:
        yield tail
993
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each string from the iterator to an IncrementalDecoder for
    the given encoding and yields every non-empty piece of output,
    followed by whatever the decoder emits on its final call.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    incremental = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = incremental.decode(chunk)
        if decoded:
            yield decoded
    # Final call with empty input flushes any pending state.
    tail = incremental.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001011
Marc-André Lemburga866df82001-01-03 21:29:14 +00001012### Helpers for charmap-based codecs
1013
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    return dict((item, item) for item in rng)
1026
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Every value of the decoding map becomes a key that maps back
        to its original key. If a value occurs more than once in the
        decoding map, it is mapped to None (undefined mapping)
        instead, causing an exception when encountered by the charmap
        codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # A target seen before is ambiguous and stays/gets undefined.
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001047
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001048### error handlers
1049
# Module-level shortcuts to the standard error handling callbacks.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # bind every name to None so the attributes still exist.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001063
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed because _false is
# always 0; it is present only so that the dependency shows up in
# this module.
# NOTE(review): presumably a named variable is used instead of a
# literal so that the guarded import is not optimised away -- confirm.
_false = 0
if _false:
    import encodings
1069
Guido van Rossum0612d842000-03-10 23:20:43 +00001070### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001071
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output:
    # data written to the wrapper is interpreted as Latin-1 and
    # passed on to the real stdout as UTF-8.
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # data is read from the real stdin as Latin-1 and handed to the
    # caller as UTF-8.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')