""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import __builtin__, sys

### Registry and builtin stateless codec functions

# Everything public in the _codecs module is re-exported from here.
try:
    from _codecs import *
except ImportError as why:
    # Nothing below can work without _codecs, so fail loudly right away.
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#
# The values are spelled as bytes literals; on Python 2 a bytes literal
# is an ordinary str, so the values are unchanged there.
#

# UTF-8
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = b'\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = b'\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = b'\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code): despite the "32"/"64" in the
# names they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

72
### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundles the entry points of one codec.

        As a tuple the instance holds exactly four items, in this order:
        (encode, decode, streamreader, streamwriter).  The same four
        objects, plus incrementalencoder, incrementaldecoder and name,
        are also stored as attributes of the instance.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries become part of the tuple value;
        # the incremental factories and the name are attributes only.
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % (
            self.__class__.__module__, self.__class__.__name__,
            self.name, id(self))
91
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
155
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        This base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the encoder.

        This base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().

        This base implementation ignores state.
        """
195
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the not yet encoded
    input in a buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = "" # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        """
        Encodes the buffered leftovers followed by input and returns the
        result; whatever _buffer_encode() did not consume is kept for the
        next call.
        """
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Resets the encoder and discards any buffered input.
        """
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        """
        Return the buffered input, or 0 if the buffer is empty.
        """
        return self.buffer or 0

    def setstate(self, state):
        """
        Restore the buffer from a getstate() value (0 means empty).
        """
        self.buffer = state or ""
228
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.

        This base implementation does nothing.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().

        This base implementation ignores state.
        """
277
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode();
        # a bytes literal, matching the (b"", 0) initial state that
        # IncrementalDecoder.getstate() documents
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decodes the buffered leftovers followed by input and returns the
        result; whatever _buffer_decode() did not consume is kept for the
        next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Resets the decoder and discards any buffered input.
        """
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        """
        Return (buffered input, 0).
        """
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        """
        Restore the buffer from the first item of a getstate() value.
        """
        # ignore additional state info
        self.buffer = state[0]
312
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # The "length consumed" part of the encode() result is not used.
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):

        """ Set the underlying stream's current position.

            The codec state is reset only when seeking to the very
            start of the stream (whence == 0 and offset == 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):

        """ Context manager entry; returns the writer itself.
        """
        return self

    def __exit__(self, type, value, tb):

        """ Context manager exit; closes the underlying stream.
        """
        self.stream.close()
390
###
392
393class StreamReader(Codec):
394
Tim Peters30324a72001-05-15 17:19:16 +0000395 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000396
397 """ Creates a StreamReader instance.
398
399 stream must be a file-like object open for reading
400 (binary) data.
401
Walter Dörwald7f82f792002-11-19 21:42:53 +0000402 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000403 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000404 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000405
406 'strict' - raise a ValueError (or a subclass)
407 'ignore' - ignore the character and continue with the next
408 'replace'- replace with a suitable replacement character;
409
Walter Dörwald7f82f792002-11-19 21:42:53 +0000410 The set of allowed parameter values can be extended via
411 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000412 """
413 self.stream = stream
414 self.errors = errors
Walter Dörwald69652032004-09-07 20:24:22 +0000415 self.bytebuffer = ""
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000416 # For str->str decoding this will stay a str
417 # For str->unicode decoding the first read will promote it to unicode
418 self.charbuffer = ""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000419 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000420
Walter Dörwald69652032004-09-07 20:24:22 +0000421 def decode(self, input, errors='strict'):
422 raise NotImplementedError
423
Martin v. Löwis56066d22005-08-24 07:38:12 +0000424 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000425
426 """ Decodes data from the stream self.stream and returns the
427 resulting object.
428
Walter Dörwald69652032004-09-07 20:24:22 +0000429 chars indicates the number of characters to read from the
430 stream. read() will never return more than chars
431 characters, but it might return less, if there are not enough
432 characters available.
433
Guido van Rossum0612d842000-03-10 23:20:43 +0000434 size indicates the approximate maximum number of bytes to
435 read from the stream for decoding purposes. The decoder
436 can modify this setting as appropriate. The default value
437 -1 indicates to read and decode as much as possible. size
438 is intended to prevent having to decode huge files in one
439 step.
440
Martin v. Löwis56066d22005-08-24 07:38:12 +0000441 If firstline is true, and a UnicodeDecodeError happens
442 after the first line terminator in the input only the first line
443 will be returned, the rest of the input will be kept until the
444 next call to read().
445
Guido van Rossum0612d842000-03-10 23:20:43 +0000446 The method should use a greedy read strategy meaning that
447 it should read as much data as is allowed within the
448 definition of the encoding and the given size, e.g. if
449 optional encoding endings or state markers are available
450 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000451 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000452 # If we have lines cached, first merge them back into characters
453 if self.linebuffer:
454 self.charbuffer = "".join(self.linebuffer)
455 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000456
Walter Dörwald69652032004-09-07 20:24:22 +0000457 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000458 while True:
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200459 # can the request be satisfied from the character buffer?
460 if chars >= 0:
Walter Dörwald69652032004-09-07 20:24:22 +0000461 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000462 break
Serhiy Storchaka2403a782014-01-26 19:20:24 +0200463 elif size >= 0:
464 if len(self.charbuffer) >= size:
465 break
Walter Dörwald69652032004-09-07 20:24:22 +0000466 # we need more data
467 if size < 0:
468 newdata = self.stream.read()
469 else:
470 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000471 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000472 data = self.bytebuffer + newdata
Martin v. Löwis56066d22005-08-24 07:38:12 +0000473 try:
474 newchars, decodedbytes = self.decode(data, self.errors)
475 except UnicodeDecodeError, exc:
476 if firstline:
477 newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
478 lines = newchars.splitlines(True)
479 if len(lines)<=1:
480 raise
481 else:
482 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000483 # keep undecoded bytes until the next call
484 self.bytebuffer = data[decodedbytes:]
485 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000486 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000487 # there was no data available
488 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000489 break
490 if chars < 0:
491 # Return everything we've got
492 result = self.charbuffer
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000493 self.charbuffer = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000494 else:
495 # Return the first chars characters
496 result = self.charbuffer[:chars]
497 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000498 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000499
Walter Dörwald69652032004-09-07 20:24:22 +0000500 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000501
502 """ Read one line from the input stream and return the
503 decoded data.
504
Walter Dörwald69652032004-09-07 20:24:22 +0000505 size, if given, is passed as size argument to the
506 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000507
Guido van Rossuma3277132000-04-11 15:37:43 +0000508 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000509 # If we have lines cached from an earlier read, return
510 # them unconditionally
511 if self.linebuffer:
512 line = self.linebuffer[0]
513 del self.linebuffer[0]
514 if len(self.linebuffer) == 1:
515 # revert to charbuffer mode; we might need more data
516 # next time
517 self.charbuffer = self.linebuffer[0]
518 self.linebuffer = None
519 if not keepends:
520 line = line.splitlines(False)[0]
521 return line
Tim Peters536cf992005-12-25 23:18:31 +0000522
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000523 readsize = size or 72
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000524 line = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000525 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000526 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000527 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000528 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000529 # If we're at a "\r" read one extra character (which might
530 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000531 # temporarily exhausted we return the wrong line ending.
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000532 if data.endswith("\r"):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000533 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000534
Walter Dörwald69652032004-09-07 20:24:22 +0000535 line += data
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000536 lines = line.splitlines(True)
537 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000538 if len(lines) > 1:
539 # More than one line result; the first line is a full line
540 # to return
541 line = lines[0]
542 del lines[0]
543 if len(lines) > 1:
544 # cache the remaining lines
545 lines[-1] += self.charbuffer
546 self.linebuffer = lines
547 self.charbuffer = None
548 else:
549 # only one remaining line, put it back into charbuffer
550 self.charbuffer = lines[0] + self.charbuffer
551 if not keepends:
552 line = line.splitlines(False)[0]
553 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000554 line0withend = lines[0]
555 line0withoutend = lines[0].splitlines(False)[0]
556 if line0withend != line0withoutend: # We really have a line end
557 # Put the rest back together and keep it until the next call
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000558 self.charbuffer = "".join(lines[1:]) + self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000559 if keepends:
560 line = line0withend
561 else:
562 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000563 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000564 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000565 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000566 if line and not keepends:
567 line = line.splitlines(False)[0]
568 break
569 if readsize<8000:
570 readsize *= 2
571 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000572
Walter Dörwald69652032004-09-07 20:24:22 +0000573 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000574
575 """ Read all lines available on the input stream
576 and return them as list of lines.
577
578 Line breaks are implemented using the codec's decoder
579 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000580
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000581 sizehint, if given, is ignored since there is no efficient
582 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000583
584 """
Walter Dörwald69652032004-09-07 20:24:22 +0000585 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000586 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000587
588 def reset(self):
589
590 """ Resets the codec buffers used for keeping state.
591
592 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000593 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000594 from decoding errors.
595
596 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000597 self.bytebuffer = ""
598 self.charbuffer = u""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000599 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000600
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000601 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000602 """ Set the input stream's current position.
603
604 Resets the codec buffers used for keeping state.
605 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000606 self.stream.seek(offset, whence)
Victor Stinner7df55da2010-05-22 13:37:56 +0000607 self.reset()
Guido van Rossum0612d842000-03-10 23:20:43 +0000608
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000609 def next(self):
610
611 """ Return the next decoded line from the input stream."""
612 line = self.readline()
613 if line:
614 return line
615 raise StopIteration
616
617 def __iter__(self):
618 return self
619
Tim Peters30324a72001-05-15 17:19:16 +0000620 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000621 getattr=getattr):
622
623 """ Inherit all other methods from the underlying stream.
624 """
Tim Peters30324a72001-05-15 17:19:16 +0000625 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000626
Georg Brandl8f99f812006-10-29 08:39:22 +0000627 def __enter__(self):
628 return self
629
630 def __exit__(self, type, value, tb):
631 self.stream.close()
632
###
634
635class StreamReaderWriter:
636
Fred Drake49fd1072000-04-13 14:11:21 +0000637 """ StreamReaderWriter instances allow wrapping streams which
638 work in both read and write modes.
639
640 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000641 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000642 instance.
643
644 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000645 # Optional attributes set by the file wrappers below
646 encoding = 'unknown'
647
Tim Peters30324a72001-05-15 17:19:16 +0000648 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000649
650 """ Creates a StreamReaderWriter instance.
651
652 stream must be a Stream-like object.
653
654 Reader, Writer must be factory functions or classes
655 providing the StreamReader, StreamWriter interface resp.
656
657 Error handling is done in the same way as defined for the
658 StreamWriter/Readers.
659
660 """
661 self.stream = stream
662 self.reader = Reader(stream, errors)
663 self.writer = Writer(stream, errors)
664 self.errors = errors
665
Tim Peters30324a72001-05-15 17:19:16 +0000666 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000667
668 return self.reader.read(size)
669
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000670 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000671
672 return self.reader.readline(size)
673
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000674 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000675
676 return self.reader.readlines(sizehint)
677
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000678 def next(self):
679
680 """ Return the next decoded line from the input stream."""
681 return self.reader.next()
682
683 def __iter__(self):
684 return self
685
Tim Peters30324a72001-05-15 17:19:16 +0000686 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000687
688 return self.writer.write(data)
689
Tim Peters30324a72001-05-15 17:19:16 +0000690 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000691
692 return self.writer.writelines(list)
693
Guido van Rossum0612d842000-03-10 23:20:43 +0000694 def reset(self):
695
696 self.reader.reset()
697 self.writer.reset()
698
Victor Stinner262be5e2010-05-22 02:11:07 +0000699 def seek(self, offset, whence=0):
Victor Stinner7df55da2010-05-22 13:37:56 +0000700 self.stream.seek(offset, whence)
701 self.reader.reset()
702 if whence == 0 and offset == 0:
703 self.writer.reset()
Victor Stinner262be5e2010-05-22 02:11:07 +0000704
Tim Peters30324a72001-05-15 17:19:16 +0000705 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000706 getattr=getattr):
707
708 """ Inherit all other methods from the underlying stream.
709 """
Tim Peters30324a72001-05-15 17:19:16 +0000710 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000711
Georg Brandl8f99f812006-10-29 08:39:22 +0000712 # these are needed to make "with codecs.open(...)" work properly
713
714 def __enter__(self):
715 return self
716
717 def __exit__(self, type, value, tb):
718 self.stream.close()
719
###
721
class StreamRecoder:

    """ StreamRecoder instances translate data between two encodings
        (a frontend and a backend view of the same stream).

        Data passed to .write() is run through the decode function and
        the result is handed to a Writer instance, which writes it to
        the underlying stream.

        Data obtained through the .read*() methods is fetched from a
        Reader instance and run through the encode function before it
        is returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion.

            stream must be a file-like object.

            encode and decode are the frontend codec functions; they
            are called as func(data, errors) and must return a
            (result, length) tuple.  encode is applied to everything
            the .read*() methods return, decode to everything passed
            to .write() and .writelines().

            Reader and Writer are the backend factories; each one is
            called as factory(stream, errors) and the resulting objects
            do the actual reading from and writing to the stream.

            errors is handed unchanged to all four of them.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        # Both backend objects share the same underlying stream.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader (size is passed through) and
            return the result re-encoded by the frontend encoder.
        """
        chunk = self.reader.read(size)
        recoded, _ = self.encode(chunk, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded.  size is only forwarded when it is not None.
        """
        args = () if size is None else (size,)
        line = self.reader.readline(*args)
        recoded, _ = self.encode(line, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read everything that is left, re-encode it and return it
            as a list of lines with the line ends kept.

            sizehint is accepted but not used.
        """
        remaining = self.reader.read()
        recoded, _ = self.encode(remaining, self.errors)
        return recoded.splitlines(True)

    def next(self):

        """ Return the next line from the backend reader, re-encoded
            by the frontend encoder."""
        line = self.reader.next()
        recoded, _ = self.encode(line, self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            through the backend writer; returns what the writer returns.
        """
        converted, _ = self.decode(data, self.errors)
        return self.writer.write(converted)

    def writelines(self, list):

        """ Join the given strings, decode the result in one go and
            write it through the backend writer.
        """
        joined = ''.join(list)
        converted, _ = self.decode(joined, self.errors)
        return self.writer.write(converted)

    def reset(self):

        """ Reset the backend reader and then the backend writer.
        """
        for backend in (self.reader, self.writer):
            backend.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other attributes from the underlying stream.
        """
        wrapped = self.stream
        return getattr(wrapped, name)

    def __enter__(self):
        # The recoder itself is the value bound by "with ... as".
        return self

    def __exit__(self, type, value, tb):
        # Only the underlying stream is closed; exceptions propagate.
        wrapped = self.stream
        wrapped.close()
835
Guido van Rossum0612d842000-03-10 23:20:43 +0000836### Shortcuts
837
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' flag is
        removed from the mode (no automatic conversion of '\\n' is
        done). This avoids data loss due to encodings using 8-bit
        values. The default file mode is 'rb' meaning to open the file
        in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the mode is used unchanged and the plain
        file object returned by the builtin open() is handed back.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError if the encoding cannot be found. The
        file that was already opened is closed before that error (or
        any other error raised while building the wrapper) propagates.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catch everything: whatever went wrong, the file
        # opened above must not leak.  The error is always re-raised.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000886
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Strings written to the wrapper are taken to be in
        data_encoding and end up in the original file in
        file_encoding.  Strings read from the wrapper are read from
        the file as file_encoding and handed back in data_encoding.
        The intermediate format depends on the codecs involved.

        file_encoding defaults to data_encoding when it is None.

        errors is passed on to the StreamRecoder and defaults to
        'strict'.

        Both codecs are looked up with lookup(), data_encoding first,
        so an unknown encoding raises whatever lookup() raises.

        The returned object carries the attributes .data_encoding and
        .file_encoding holding the encoding names that were used.

    """
    target_encoding = data_encoding if file_encoding is None else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(target_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode,
        frontend.decode,
        backend.streamreader,
        backend.streamwriter,
        errors,
    )
    # Record the encodings on the instance for introspection.
    recoder.data_encoding = data_encoding
    recoder.file_encoding = target_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000922
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000923### Helpers for codec lookup
924
def getencoder(encoding):

    """ Look up the codec registered for encoding and return its
        encoder function.

        An unknown encoding makes lookup() raise a LookupError.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000934
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return its
        decoder function.

        An unknown encoding makes lookup() raise a LookupError.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
944
def getincrementalencoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalEncoder class or factory function.

        Raises a LookupError if the encoding is unknown or if the
        codec has no incremental encoder (its entry is None).

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
958
def getincrementaldecoder(encoding):

    """ Look up the codec registered for encoding and return its
        IncrementalDecoder class or factory function.

        Raises a LookupError if the encoding is unknown or if the
        codec has no incremental decoder (its entry is None).

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000972
def getreader(encoding):

    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function.

        An unknown encoding makes lookup() raise a LookupError.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000982
def getwriter(encoding):

    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function.

        An unknown encoding makes lookup() raise a LookupError.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
992
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Generator that feeds every string from iterator to an
    IncrementalEncoder for encoding and yields each non-empty result.
    After the input is exhausted the encoder is called once more with
    an empty string and final=True, and that result is yielded too if
    it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.  The encoder is only looked up and created when the
    generator is first advanced.
    """
    encode = getincrementalencoder(encoding)(errors, **kwargs).encode
    for chunk in iterator:
        encoded = encode(chunk)
        if encoded:
            yield encoded
    # Final call so the encoder can emit whatever it still holds.
    tail = encode("", True)
    if tail:
        yield tail
1010
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Generator that feeds every string from iterator to an
    IncrementalDecoder for encoding and yields each non-empty result.
    After the input is exhausted the decoder is called once more with
    an empty string and final=True, and that result is yielded too if
    it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.  The decoder is only looked up and created when the
    generator is first advanced.
    """
    decode = getincrementaldecoder(encoding)(errors, **kwargs).decode
    for chunk in iterator:
        decoded = decode(chunk)
        if decoded:
            yield decoded
    # Final call so the decoder can emit whatever it still holds.
    tail = decode("", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001028
Marc-André Lemburga866df82001-01-03 21:29:14 +00001029### Helpers for charmap-based codecs
1030
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element produced by
        iterating over rng is both key and value.

    """
    return dict((item, item) for item in rng)
1043
def make_encoding_map(decoding_map):

    """ Creates an encoding map by inverting a decoding map.

        Each value of decoding_map becomes a key that maps back to
        its original key.  A value that appears more than once in the
        decoding map is mapped to None (undefined mapping) instead,
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Second or later occurrence: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001064
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001065### error handlers
1066
try:
    # Bind the standard error handlers to module-level names.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, ("strict",
                                                   "ignore",
                                                   "replace",
                                                   "xmlcharrefreplace",
                                                   "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = \
        xmlcharrefreplace_errors = backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package.  The import sits behind a variable that is always false,
# so it is present in the module's code but never executed.
_false = 0
if _false:
    import encodings
1086
Guido van Rossum0612d842000-03-10 23:20:43 +00001087### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001088
if __name__ == '__main__':

    # Only when run as a script: replace the standard streams with
    # recoding wrappers.

    # Strings written to stdout are taken as Latin-1 and written to
    # the real stdout as UTF-8.
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Data read from the real stdin is taken as Latin-1 and handed to
    # the program as UTF-8.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')