blob: 6d8d5544edd65001dc9dbb4ec0a249644f5dc561 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Georg Brandl1a3284e2007-12-02 09:40:06 +000010import builtins, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
Tim Peters30324a72001-05-15 17:19:16 +000019__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000020 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000022 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
Guido van Rossum0612d842000-03-10 23:20:43 +000027### Constants
28
29#
Walter Dörwald474458d2002-06-04 15:16:29 +000030# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000033#
Guido van Rossum0612d842000-03-10 23:20:43 +000034
# UTF-8 signature
BOM_UTF8 = b'\xef\xbb\xbf'

# UTF-16: one constant per byte order; BOM_LE / BOM_BE are short aliases
BOM_UTF16_LE = b'\xff\xfe'
BOM_UTF16_BE = b'\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32: one constant per byte order
BOM_UTF32_LE = b'\xff\xfe\x00\x00'
BOM_UTF32_BE = b'\x00\x00\xfe\xff'

# Native-endianness aliases, selected from the byte order of this machine
if sys.byteorder != 'little':
    BOM = BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM = BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# Old broken names (don't use in new code).  The "32"/"64" names really
# refer to the UTF-16/UTF-32 marks; they are kept as they are for
# backward compatibility.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Record describing one codec.

        The object is the 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally exposes those four entries,
        the incremental encoder/decoder factories and the codec name
        as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only these four entries are part of the tuple value itself
        tuple_fields = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, tuple_fields)
        info.encode, info.decode = encode, decode
        info.streamreader, info.streamwriter = streamreader, streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
Thomas Woutersa9773292006-04-21 09:43:23 +000093
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations must not keep state on the Codec
            instance; codecs that need state between calls should be
            written as one of the stateful classes (for example a
            StreamWriter) instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object exposing a readable buffer (the
            original wording names the bf_getreadbuf buffer slot,
            with strings, buffer objects and memory mapped files as
            examples).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations must not keep state on the Codec
            instance; codecs that need state between calls should be
            written as one of the stateful classes (for example a
            StreamReader) instead.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
157
class IncrementalEncoder(object):
    """
    Base class for encoders that receive their input in several pieces.

    Each piece is handed to encode(); the encoder object carries the state
    of the encoding process from one encode() call to the next.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme used by the encoder
        (default 'strict'); see the Codec class docstring for the
        predefined values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the encoder.

        The base implementation always reports 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously returned by getstate().

        The base implementation ignores state.
        """
        return None
197
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always consume
    all of their input at once: whatever _buffer_encode() leaves
    unconsumed is kept in self.buffer and prepended to the input of the
    next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input not yet encoded, carried over between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend what the previous call left over
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # everything past the consumed prefix waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        leftover = self.buffer
        if leftover:
            return leftover
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) means "empty"
        if state:
            self.buffer = state
        else:
            self.buffer = ""
231
class IncrementalDecoder(object):
    """
    Base class for decoders that receive their input in several pieces.

    Each piece is handed to decode(); the decoder object carries the state
    of the decoding process from one decode() call to the next.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme used by the decoder
        (default 'strict'); see the Codec class docstring for the
        predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """
        return None

    def getstate(self):
        """
        Return the current state of the decoder as a
        (buffered_input, additional_state_info) tuple.

        By convention additional_state_info describes the decoder state
        WITHOUT the contents of buffered_input having been processed yet.
        The base implementation reports an empty buffer and 0.
        """
        initial_state = (b"", 0)
        return initial_state

    def setstate(self, state):
        """
        Restore a state previously returned by getstate().

        The base implementation ignores state.
        """
        return None
273
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences: whatever _buffer_decode() leaves
    unconsumed is kept in self.buffer and prepended to the input of the
    next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input not yet decoded, carried over between decode() calls
        self.buffer = b""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend what the previous call left over
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # everything past the consumed prefix waits for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = b""

    def getstate(self):
        # there is no state besides the buffer, so the second item is 0
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input matters; the second item is ignored
        buffered_input = state[0]
        self.buffer = buffered_input
309
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000316
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used by the
            writer.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        # the consumed length reported by encode() is not needed here
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and write the result
            with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows fresh data to be appended without rescanning
            the whole stream to recover state.

            The base implementation does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer itself on
            the underlying stream.
        """
        target = self.stream
        return getattr(target, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()
382
Guido van Rossum0612d842000-03-10 23:20:43 +0000383###
384
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream which could not be decoded yet
        self.bytebuffer = b""
        # decoded characters which have not been returned yet
        self.charbuffer = ""
        # lines cached by readline(); readline() sets charbuffer to
        # None while it keeps its leftovers here
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If neither size nor chars is given, everything up to the
            end of the stream is returned, including characters which
            were left buffered by earlier read()/readline() calls.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # With no limit at all we must NOT stop merely because some
            # characters are already buffered: that would return only
            # the leftovers of a previous readline()/read(size) call
            # instead of the rest of the stream.  The loop then ends
            # once the stream has no more data (see below).
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the part in front of the error; this
                    # is good enough if it contains a complete line
                    newchars, decodedbytes = \
                        self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line ending is stripped from the
            returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the chunk size for the next attempt, up to a limit
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = b""
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()
628
Guido van Rossum0612d842000-03-10 23:20:43 +0000629###
630
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each one is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read and decode data via the reader; size is passed on
            to its read() method.
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Read one decoded line via the reader."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Read all remaining lines via the reader."""
        return self.reader.readlines(sizehint)

    def __next__(self):

        """ Return the next decoded line from the input stream."""
        return next(self.reader)

    def __iter__(self):
        return self

    def write(self, data):

        """ Encode data and write it to the stream via the writer."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Write the strings in list via the writer."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the codec state of both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the stream's current position and return whatever the
            underlying stream's seek() returns.

            Without this method seek() would be forwarded straight to
            the stream by __getattr__, leaving data of the old
            position in the reader's buffers.  The reader is
            therefore reset after repositioning; the writer is reset
            as well when seeking to the very start of the stream.
        """
        position = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return position

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
709
Guido van Rossum0612d842000-03-10 23:20:43 +0000710###
711
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, i.e.
            they are called as func(data, errors) and return a
            (result, length) tuple.  Reader, Writer must be factory
            functions or classes providing the StreamReader,
            StreamWriter interface resp.; each is called as
            factory(stream, errors).

            errors is passed on unchanged to all four of them.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader (size is passed through) and
            return the data re-encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.

            size is only passed to the reader if it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, _ = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything the backend reader has left, re-encode it
            and return it as a list of lines with line ends kept.

            sizehint is accepted for interface compatibility but
            ignored.
        """
        data = self.reader.read()
        data, _ = self.encode(data, self.errors)
        return data.splitlines(True)

    def __next__(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encoder.
        """
        data = next(self.reader)
        data, _ = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the
            result through the backend writer.
        """
        data, _ = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the given sequence of frontend strings, decode
            the result and write it through the backend writer.
        """
        chunks = tuple(list)
        # Join with an empty separator of the same type as the data.
        # The frontend normally carries bytes, for which the former
        # ''.join() raised a TypeError; str input keeps working.
        if chunks and isinstance(chunks[0], str):
            joiner = ''
        else:
            joiner = b''
        data, _ = self.decode(joiner.join(chunks), self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and the backend writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving a "with" block closes the underlying stream
        self.stream.close()
825
Guido van Rossum0612d842000-03-10 23:20:43 +0000826### Shortcuts
827
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified.  This is done to
        avoid data loss due to encodings using 8-bit values.  The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict'; it is passed on to the StreamReaderWriter.

        buffering is passed unchanged to the builtin open().  It
        defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises whatever the builtin open() raises, and a LookupError
        if the encoding cannot be found.  In the latter case the
        already opened file is closed again before the error
        propagates.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = builtins.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
    except BaseException:
        # Don't leak the open file handle when the codec cannot be set up
        file.close()
        raise
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000871
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings and return the wrapper.

        Data written to the wrapper is interpreted according to
        data_encoding and stored in the original file using
        file_encoding.  Data read from the file is interpreted using
        file_encoding and handed back to the caller in data_encoding.

        file_encoding defaults to data_encoding when it is None.

        errors selects the error handling and is passed on to the
        StreamRecoder; it defaults to 'strict'.

        The wrapper carries the two attributes .data_encoding and
        .file_encoding holding the encoding names that were used,
        for introspection by Python programs.

        A LookupError from lookup() propagates if either encoding
        cannot be found.

    """
    backend_encoding = data_encoding if file_encoding is None \
                       else file_encoding
    frontend = lookup(data_encoding)
    backend = lookup(backend_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Record the encodings on the wrapper to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = backend_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000907
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000908### Helpers for codec lookup
909
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000919
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
929
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
943
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000957
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000967
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
977
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from the iterator to an IncrementalEncoder for
    the given encoding and yields each non-empty piece of output.  After
    the input is exhausted the encoder is flushed and any remaining
    output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    incremental = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = incremental.encode(chunk)
        if encoded:
            yield encoded
    # Final call tells the encoder that no more input follows
    tail = incremental.encode("", True)
    if tail:
        yield tail
995
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every chunk from the iterator to an IncrementalDecoder for
    the given encoding and yields each non-empty piece of output.  After
    the input is exhausted the decoder is flushed and any remaining
    output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    incremental = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = incremental.decode(chunk)
        if decoded:
            yield decoded
    # Final call tells the decoder that no more input follows
    tail = incremental.decode(b"", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +00001013
Marc-André Lemburga866df82001-01-03 21:29:14 +00001014### Helpers for charmap-based codecs
1015
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is both a key and that key's value.

    """
    return {item: item for item in rng}
1028
def make_encoding_map(decoding_map):

    """ Creates an encoding map by inverting a decoding map.

        A target that is reached from more than one key of the
        decoding map is mapped to None (undefined mapping) instead
        of to any of those keys, causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # First sighting records the source; any repeat voids the entry
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001049
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001050### error handlers
1051
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = [
        lookup_error(handler_name)
        for handler_name in ("strict", "ignore", "replace",
                             "xmlcharrefreplace", "backslashreplace")
    ]
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # a single failed lookup leaves all five shortcuts set to None
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001065
# The import below never executes because its guard is always false.
# It is spelled out only so that modulefinder sees that programs
# using codecs probably need the encodings package as well.
_false = 0
if _false:
    import encodings
1071
Guido van Rossum0612d842000-03-10 23:20:43 +00001072### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001073
if __name__ == '__main__':

    # Replace stdout with a recoder whose data encoding is Latin-1
    # and whose file encoding is UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Replace stdin with a recoder whose data encoding is UTF-8
    # and whose file encoding is Latin-1
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')