blob: 049a3f0fd1f3c65d795f5b5176212a2cb4345a39 [file] [log] [blame]
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
try:
    # The registry functions and the builtin stateless codec functions
    # all come from the _codecs extension module.
    from _codecs import *
except ImportError as why:
    # Re-raise the failed import as a SystemError carrying the original
    # message.
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Public names exported by "from codecs import *".
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes and stream wrappers
    "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # codec lookup shortcuts
    "getencoder", "getdecoder", "getincrementalencoder",
    "getincrementaldecoder", "getreader", "getwriter",
    # one-shot and iterative helpers
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000032
Guido van Rossum0612d842000-03-10 23:20:43 +000033### Constants
34
35#
Walter Dörwald474458d2002-06-04 15:16:29 +000036# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000039#
Guido van Rossum0612d842000-03-10 23:20:43 +000040
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16 and UTF-32 marks in the native endianness of this machine
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE

# BOM is the native UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000077
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):

    """ Bundle of the entry points that make up one codec.

        The instance is the 4-tuple (encode, decode, streamreader,
        streamwriter). The same four values, the two incremental
        factories and the codec name are also stored as attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        # attribute access to everything, including what is not in the tuple
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
97
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple (output
            object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict' handling.

            Implementations may not store state in the Codec
            instance. Use StreamWriter for codecs which have to keep
            state in order to make encoding efficient.

            Zero length input must be handled and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict' handling.

            Implementations may not store state in the Codec
            instance. Use StreamReader for codecs which have to keep
            state in order to make decoding efficient.

            Zero length input must be handled and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
161
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input in several steps.

    The input is handed piece by piece to encode(); the encoder keeps the
    state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance; see the Codec docstring for the predefined values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be provided by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (nothing to do here).
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 here).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate(). This base implementation ignores it.
        """
201
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that cannot always consume all of
    their input at once: whatever _buffer_encode() leaves unconsumed is
    kept in self.buffer and prepended to the input of the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input that is kept between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) means "empty"
        if state:
            self.buffer = state
        else:
            self.buffer = ""
234
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input in several steps.

    The input is handed piece by piece to decode(); the decoder keeps the
    state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance; see the Codec docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be provided by subclasses; this base implementation always
        raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (nothing to do here).
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that were
        passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed. In the initial state and after reset(),
        getstate() must return (b"", 0), which is what this base
        implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset(). This base
        implementation ignores state.
        """
283
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with incomplete
    byte sequences: whatever _buffer_decode() leaves unconsumed is kept in
    self.buffer and prepended to the input of the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input that is kept between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what was left over from the previous call
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # the buffer is the whole state; additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input is restored; additional state info is
        # ignored
        self.buffer = state[0]
318
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000325
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            More values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and hand the result to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should leave the output in a clean
            state, so that fresh data can be appended without having
            to rescan the whole stream to recover state.

            This base implementation does nothing.

        """

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream.

            The codec state is reset only when seeking to absolute
            position 0.
        """
        self.stream.seek(offset, whence)
        if whence != 0 or offset != 0:
            return
        self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer on the
            underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the "with" block closes the underlying stream
        self.stream.close()
396
Guido van Rossum0612d842000-03-10 23:20:43 +0000397###
398
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # input read from the stream that decode() has not consumed yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of complete lines cached by readline(), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        """ Must be provided by subclasses; always raises
            NotImplementedError here.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # Retry with only the input in front of the error; the
                    # error is swallowed only if that yields more than one
                    # line.
                    newchars, decodedbytes = self.decode(data[:exc.start],
                                                         self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method. In that case read() is called only once, so
            the result may be an incomplete line.

            If keepends is false the line terminator is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the chunk size for the next attempt; no more doubling
            # once it has reached 8000
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The decoded data is split with splitlines(); line breaks
            are included in the list entries when keepends is true.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Use a plain "" exactly like __init__() and read() do. A u""
        # here would promote everything read after a reset()/seek() to
        # unicode, even for str->str codecs.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the "with" block closes the underlying stream
        self.stream.close()
638
Guido van Rossum0612d842000-03-10 23:20:43 +0000639###
640
641class StreamReaderWriter:
642
Fred Drake49fd1072000-04-13 14:11:21 +0000643 """ StreamReaderWriter instances allow wrapping streams which
644 work in both read and write modes.
645
646 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000647 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000648 instance.
649
650 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000651 # Optional attributes set by the file wrappers below
652 encoding = 'unknown'
653
Tim Peters30324a72001-05-15 17:19:16 +0000654 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000655
656 """ Creates a StreamReaderWriter instance.
657
658 stream must be a Stream-like object.
659
660 Reader, Writer must be factory functions or classes
661 providing the StreamReader, StreamWriter interface resp.
662
663 Error handling is done in the same way as defined for the
664 StreamWriter/Readers.
665
666 """
667 self.stream = stream
668 self.reader = Reader(stream, errors)
669 self.writer = Writer(stream, errors)
670 self.errors = errors
671
Tim Peters30324a72001-05-15 17:19:16 +0000672 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000673
674 return self.reader.read(size)
675
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000676 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000677
678 return self.reader.readline(size)
679
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000680 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000681
682 return self.reader.readlines(sizehint)
683
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000684 def next(self):
685
686 """ Return the next decoded line from the input stream."""
687 return self.reader.next()
688
689 def __iter__(self):
690 return self
691
Tim Peters30324a72001-05-15 17:19:16 +0000692 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000693
694 return self.writer.write(data)
695
Tim Peters30324a72001-05-15 17:19:16 +0000696 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000697
698 return self.writer.writelines(list)
699
Guido van Rossum0612d842000-03-10 23:20:43 +0000700 def reset(self):
701
702 self.reader.reset()
703 self.writer.reset()
704
Victor Stinner262be5e2010-05-22 02:11:07 +0000705 def seek(self, offset, whence=0):
Victor Stinner7df55da2010-05-22 13:37:56 +0000706 self.stream.seek(offset, whence)
707 self.reader.reset()
708 if whence == 0 and offset == 0:
709 self.writer.reset()
Victor Stinner262be5e2010-05-22 02:11:07 +0000710
Tim Peters30324a72001-05-15 17:19:16 +0000711 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000712 getattr=getattr):
713
714 """ Inherit all other methods from the underlying stream.
715 """
Tim Peters30324a72001-05-15 17:19:16 +0000716 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000717
Georg Brandl8f99f812006-10-29 08:39:22 +0000718 # these are needed to make "with codecs.open(...)" work properly
719
720 def __enter__(self):
721 return self
722
723 def __exit__(self, type, value, tb):
724 self.stream.close()
725
Guido van Rossum0612d842000-03-10 23:20:43 +0000726###
727
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the StreamRecoder is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the underlying stream using
        an instance of the provided Writer class.

        In the other direction, data is read from the underlying
        stream using a Reader instance and then encoded before it is
        returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and the data passed to .write())
            while Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader (size is passed through to it)
            and return the data encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encoder.

            size is only passed on to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader delivers, encode it with
            the frontend encoder and return it as a list of lines with
            line ends kept.

            sizehint is accepted for interface compatibility but is
            not used.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            through the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the strings in list, decode the result with the
            frontend decoder and write it through the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the codec state of the backend reader and writer.
        """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Reposition the underlying stream and drop the codec state
            invalidated by the move.

            Without this method a seek would be delegated straight to
            the stream by __getattr__ and data already buffered by the
            reader would be returned by the next read, even though it
            belongs to the old position.

            The reader is reset on every seek; the writer is reset only
            when seeking to the very start of the stream.  The return
            value of the stream's own .seek() is passed back, exactly
            as it was when the call was delegated.
        """
        result = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return result

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
841
Guido van Rossum0612d842000-03-10 23:20:43 +0000842### Shortcuts
843
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified, and a 'U' flag in
        mode is dropped. This is done to avoid data loss due to
        encodings using 8-bit values. The default file mode is 'rb'
        meaning to open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
    except BaseException:
        # Nobody else holds a reference to the file yet, so it has to
        # be closed here or the handle leaks until garbage collection.
        file.close()
        raise
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000892
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates transparently
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to data_encoding and are written to the original file as
        strings using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and are
        passed back to the caller as strings using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Record the encodings on the wrapper to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000928
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000929### Helpers for codec lookup
930
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000940
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
950
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
964
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000978
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000988
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
998
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes each input string from the iterator with an
    IncrementalEncoder and yields every non-empty piece of output,
    followed by whatever the encoder emits when it is flushed.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            continue
        yield encoded
    # Final call with final=True flushes any state held by the encoder
    tail = encoder.encode("", True)
    if tail:
        yield tail
1016
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes each input string from the iterator with an
    IncrementalDecoder and yields every non-empty piece of output,
    followed by whatever the decoder emits when it is flushed.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Flush with final=True.  The decoder consumes byte strings, so the
    # empty input is spelled as a bytes literal; on Python 2.6+ this is
    # the very same object type as "".
    tail = decoder.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001034
Marc-André Lemburga866df82001-01-03 21:29:14 +00001035### Helpers for charmap-based codecs
1036
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is both key and value.

    """
    return dict((item, item) for item in rng)
1049
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Target already seen: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001070
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001071### error handlers
1072
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # if any lookup fails, all five names are bound to None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001086
Martin v. Löwis6cd441d2001-07-31 08:54:55 +00001087# Tell modulefinder that using codecs probably needs the encodings
1088# package
1089_false = 0
1090if _false:
1091 import encodings
1092
Guido van Rossum0612d842000-03-10 23:20:43 +00001093### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001094
if __name__ == '__main__':

    # Demo: route the standard streams through recoding wrappers.

    # Data written to stdout is taken as Latin-1 and emitted as UTF-8
    sys.stdout = EncodedFile(sys.stdout, data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data read from stdin is decoded as Latin-1 and handed out as UTF-8
    sys.stdin = EncodedFile(sys.stdin, data_encoding='utf-8',
                            file_encoding='latin-1')