blob: 1518d75f9d20fd926558077dea05cba49a38662f [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
# Pull every public name of the builtin _codecs module into this module's
# namespace.  If that import fails, report it as a SystemError that carries
# the original ImportError text.
try:
    from _codecs import *
except ImportError, why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Names exported by "from codecs import *".
# NOTE(review): several of these (register, lookup, *_errors, ...) are not
# defined in this part of the file -- presumably they come from the
# "from _codecs import *" above; confirm before editing this list.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
Guido van Rossum0612d842000-03-10 23:20:43 +000027### Constants
28
29#
Walter Dörwald474458d2002-06-04 15:16:29 +000030# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000033#
Guido van Rossum0612d842000-03-10 23:20:43 +000034
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little and big endian (BOM_LE / BOM_BE are short aliases)
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little and big endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16 / UTF-32 in the byte order of the machine we are running on
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundles the entry points of one codec.

        The object is a 4-tuple (encode, decode, streamreader,
        streamwriter); every entry point, the incremental
        encoder/decoder factories and the codec name are additionally
        available as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries are part of the tuple value.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        # Everything (including the tuple members) is mirrored as an
        # instance attribute.
        for attr, value in (
                ("name", name),
                ("encode", encode),
                ("decode", decode),
                ("incrementalencoder", incrementalencoder),
                ("incrementaldecoder", incrementaldecoder),
                ("streamwriter", streamwriter),
                ("streamreader", streamreader)):
            setattr(info, attr, value)
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
91
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs that need state to work efficiently should use the
            stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs that need state to work efficiently should use the
            stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()
155
class IncrementalEncoder(object):
    """
    Base class for encoders that process their input in several steps.
    The pieces of the input are handed to encode() one after another; the
    encoder keeps whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance as self.errors.  See the Codec class docstring for the
        predefined values.
        """
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.  Must be
        implemented by subclasses.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Resets the encoder to the initial state.  The base class keeps no
        state that would need resetting, so this does nothing.
        """
183
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that cannot always encode all of
    the input they are given: whatever _buffer_encode() did not consume is
    kept in self.buffer and prepended to the input of the next encode()
    call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet; carried over between calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        (output, used) = self._buffer_encode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""
210
class IncrementalDecoder(object):
    """
    Base class for decoders that process their input in several steps.
    The pieces of the input are handed to decode() one after another; the
    decoder keeps whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance as self.errors.  See the Codec class docstring for the
        predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.  Must be
        implemented by subclasses.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Resets the decoder to the initial state.  The base class keeps no
        state that would need resetting, so this does nothing.
        """
237
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with incomplete
    byte sequences: whatever _buffer_decode() did not consume is kept in
    self.buffer and prepended to the input of the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet; carried over between calls
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return a tuple
        # (output, length consumed).
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend what was left over from the previous call.
        pending = self.buffer + input
        (output, used) = self._buffer_decode(pending, self.errors, final)
        # Whatever was not consumed waits for the next call.
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""
Thomas Woutersa9773292006-04-21 09:43:23 +0000264
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000271
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encodes object using self.errors and writes the encoded
            data to self.stream.
        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Joins the strings in list and writes the result to the
            stream with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state.  The base implementation
            has nothing to do.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Attributes not found on the writer are looked up on the
            underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000331
332###
333
334class StreamReader(Codec):
335
Tim Peters30324a72001-05-15 17:19:16 +0000336 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000337
338 """ Creates a StreamReader instance.
339
340 stream must be a file-like object open for reading
341 (binary) data.
342
Walter Dörwald7f82f792002-11-19 21:42:53 +0000343 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000344 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000345 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000346
347 'strict' - raise a ValueError (or a subclass)
348 'ignore' - ignore the character and continue with the next
349 'replace'- replace with a suitable replacement character;
350
Walter Dörwald7f82f792002-11-19 21:42:53 +0000351 The set of allowed parameter values can be extended via
352 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000353 """
354 self.stream = stream
355 self.errors = errors
Walter Dörwald69652032004-09-07 20:24:22 +0000356 self.bytebuffer = ""
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000357 # For str->str decoding this will stay a str
358 # For str->unicode decoding the first read will promote it to unicode
359 self.charbuffer = ""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000360 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000361
Walter Dörwald69652032004-09-07 20:24:22 +0000362 def decode(self, input, errors='strict'):
363 raise NotImplementedError
364
Martin v. Löwis56066d22005-08-24 07:38:12 +0000365 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000366
367 """ Decodes data from the stream self.stream and returns the
368 resulting object.
369
Walter Dörwald69652032004-09-07 20:24:22 +0000370 chars indicates the number of characters to read from the
371 stream. read() will never return more than chars
372 characters, but it might return less, if there are not enough
373 characters available.
374
Guido van Rossum0612d842000-03-10 23:20:43 +0000375 size indicates the approximate maximum number of bytes to
376 read from the stream for decoding purposes. The decoder
377 can modify this setting as appropriate. The default value
378 -1 indicates to read and decode as much as possible. size
379 is intended to prevent having to decode huge files in one
380 step.
381
Martin v. Löwis56066d22005-08-24 07:38:12 +0000382 If firstline is true, and a UnicodeDecodeError happens
383 after the first line terminator in the input only the first line
384 will be returned, the rest of the input will be kept until the
385 next call to read().
386
Guido van Rossum0612d842000-03-10 23:20:43 +0000387 The method should use a greedy read strategy meaning that
388 it should read as much data as is allowed within the
389 definition of the encoding and the given size, e.g. if
390 optional encoding endings or state markers are available
391 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000392 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000393 # If we have lines cached, first merge them back into characters
394 if self.linebuffer:
395 self.charbuffer = "".join(self.linebuffer)
396 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000397
Walter Dörwald69652032004-09-07 20:24:22 +0000398 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000399 while True:
400 # can the request can be satisfied from the character buffer?
401 if chars < 0:
Walter Dörwaldca199432006-03-06 22:39:12 +0000402 if size < 0:
403 if self.charbuffer:
404 break
405 elif len(self.charbuffer) >= size:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000406 break
Guido van Rossum0612d842000-03-10 23:20:43 +0000407 else:
Walter Dörwald69652032004-09-07 20:24:22 +0000408 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000409 break
410 # we need more data
411 if size < 0:
412 newdata = self.stream.read()
413 else:
414 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000415 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000416 data = self.bytebuffer + newdata
Martin v. Löwis56066d22005-08-24 07:38:12 +0000417 try:
418 newchars, decodedbytes = self.decode(data, self.errors)
419 except UnicodeDecodeError, exc:
420 if firstline:
421 newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
422 lines = newchars.splitlines(True)
423 if len(lines)<=1:
424 raise
425 else:
426 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000427 # keep undecoded bytes until the next call
428 self.bytebuffer = data[decodedbytes:]
429 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000430 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000431 # there was no data available
432 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000433 break
434 if chars < 0:
435 # Return everything we've got
436 result = self.charbuffer
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000437 self.charbuffer = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000438 else:
439 # Return the first chars characters
440 result = self.charbuffer[:chars]
441 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000442 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000443
Walter Dörwald69652032004-09-07 20:24:22 +0000444 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000445
446 """ Read one line from the input stream and return the
447 decoded data.
448
Walter Dörwald69652032004-09-07 20:24:22 +0000449 size, if given, is passed as size argument to the
450 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000451
Guido van Rossuma3277132000-04-11 15:37:43 +0000452 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000453 # If we have lines cached from an earlier read, return
454 # them unconditionally
455 if self.linebuffer:
456 line = self.linebuffer[0]
457 del self.linebuffer[0]
458 if len(self.linebuffer) == 1:
459 # revert to charbuffer mode; we might need more data
460 # next time
461 self.charbuffer = self.linebuffer[0]
462 self.linebuffer = None
463 if not keepends:
464 line = line.splitlines(False)[0]
465 return line
Tim Peters536cf992005-12-25 23:18:31 +0000466
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000467 readsize = size or 72
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000468 line = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000469 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000470 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000471 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000472 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000473 # If we're at a "\r" read one extra character (which might
474 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000475 # temporarily exhausted we return the wrong line ending.
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000476 if data.endswith("\r"):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000477 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000478
Walter Dörwald69652032004-09-07 20:24:22 +0000479 line += data
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000480 lines = line.splitlines(True)
481 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000482 if len(lines) > 1:
483 # More than one line result; the first line is a full line
484 # to return
485 line = lines[0]
486 del lines[0]
487 if len(lines) > 1:
488 # cache the remaining lines
489 lines[-1] += self.charbuffer
490 self.linebuffer = lines
491 self.charbuffer = None
492 else:
493 # only one remaining line, put it back into charbuffer
494 self.charbuffer = lines[0] + self.charbuffer
495 if not keepends:
496 line = line.splitlines(False)[0]
497 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000498 line0withend = lines[0]
499 line0withoutend = lines[0].splitlines(False)[0]
500 if line0withend != line0withoutend: # We really have a line end
501 # Put the rest back together and keep it until the next call
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000502 self.charbuffer = "".join(lines[1:]) + self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000503 if keepends:
504 line = line0withend
505 else:
506 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000507 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000508 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000509 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000510 if line and not keepends:
511 line = line.splitlines(False)[0]
512 break
513 if readsize<8000:
514 readsize *= 2
515 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000516
Walter Dörwald69652032004-09-07 20:24:22 +0000517 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000518
519 """ Read all lines available on the input stream
520 and return them as list of lines.
521
522 Line breaks are implemented using the codec's decoder
523 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000524
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000525 sizehint, if given, is ignored since there is no efficient
526 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000527
528 """
Walter Dörwald69652032004-09-07 20:24:22 +0000529 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000530 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000531
532 def reset(self):
533
534 """ Resets the codec buffers used for keeping state.
535
536 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000537 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000538 from decoding errors.
539
540 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000541 self.bytebuffer = ""
542 self.charbuffer = u""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000543 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000544
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000545 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000546 """ Set the input stream's current position.
547
548 Resets the codec buffers used for keeping state.
549 """
550 self.reset()
551 self.stream.seek(offset, whence)
Guido van Rossum0612d842000-03-10 23:20:43 +0000552
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000553 def next(self):
554
555 """ Return the next decoded line from the input stream."""
556 line = self.readline()
557 if line:
558 return line
559 raise StopIteration
560
561 def __iter__(self):
562 return self
563
Tim Peters30324a72001-05-15 17:19:16 +0000564 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000565 getattr=getattr):
566
567 """ Inherit all other methods from the underlying stream.
568 """
Tim Peters30324a72001-05-15 17:19:16 +0000569 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000570
571###
572
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        The constructor takes the reader and writer factories in the
        form returned by codecs.lookup(), so an instance can be built
        directly from a lookup result.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively; each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        # The reader is created before the writer.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    # --- reading: delegated to self.reader

    def read(self, size=-1):

        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    # --- writing: delegated to self.writer

    def write(self, data):

        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        # reader first, then writer
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Attributes not found on the wrapper are looked up on the
            underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000643
644###
645
646class StreamRecoder:
647
Fred Drake49fd1072000-04-13 14:11:21 +0000648 """ StreamRecoder instances provide a frontend - backend
649 view of encoding data.
650
651 They use the complete set of APIs returned by the
652 codecs.lookup() function to implement their task.
653
654 Data written to the stream is first decoded into an
655 intermediate format (which is dependent on the given codec
656 combination) and then written to the stream using an instance
657 of the provided Writer class.
658
659 In the other direction, data is read from the stream using a
660 Reader instance and then return encoded data to the caller.
661
662 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000663 # Optional attributes set by the file wrappers below
664 data_encoding = 'unknown'
665 file_encoding = 'unknown'
666
Tim Peters30324a72001-05-15 17:19:16 +0000667 def __init__(self, stream, encode, decode, Reader, Writer,
668 errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000669
670 """ Creates a StreamRecoder instance which implements a two-way
671 conversion: encode and decode work on the frontend (the
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000672 input to .read() and output of .write()) while
Guido van Rossum0612d842000-03-10 23:20:43 +0000673 Reader and Writer work on the backend (reading and
Fred Drake908670c2000-03-17 15:42:11 +0000674 writing to the stream).
Guido van Rossum0612d842000-03-10 23:20:43 +0000675
676 You can use these objects to do transparent direct
677 recodings from e.g. latin-1 to utf-8 and back.
678
679 stream must be a file-like object.
680
681 encode, decode must adhere to the Codec interface, Reader,
682 Writer must be factory functions or classes providing the
683 StreamReader, StreamWriter interface resp.
684
685 encode and decode are needed for the frontend translation,
686 Reader and Writer for the backend translation. Unicode is
687 used as intermediate encoding.
688
689 Error handling is done in the same way as defined for the
690 StreamWriter/Readers.
691
692 """
693 self.stream = stream
694 self.encode = encode
695 self.decode = decode
696 self.reader = Reader(stream, errors)
697 self.writer = Writer(stream, errors)
698 self.errors = errors
699
Tim Peters30324a72001-05-15 17:19:16 +0000700 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000701
702 data = self.reader.read(size)
703 data, bytesencoded = self.encode(data, self.errors)
704 return data
705
Tim Peters30324a72001-05-15 17:19:16 +0000706 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000707
708 if size is None:
709 data = self.reader.readline()
710 else:
711 data = self.reader.readline(size)
712 data, bytesencoded = self.encode(data, self.errors)
713 return data
714
Tim Peters30324a72001-05-15 17:19:16 +0000715 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000716
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000717 data = self.reader.read()
Guido van Rossuma3277132000-04-11 15:37:43 +0000718 data, bytesencoded = self.encode(data, self.errors)
719 return data.splitlines(1)
720
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000721 def next(self):
722
723 """ Return the next decoded line from the input stream."""
Walter Dörwaldc5238b82005-09-01 11:56:53 +0000724 data = self.reader.next()
725 data, bytesencoded = self.encode(data, self.errors)
726 return data
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000727
728 def __iter__(self):
729 return self
730
Tim Peters30324a72001-05-15 17:19:16 +0000731 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000732
733 data, bytesdecoded = self.decode(data, self.errors)
734 return self.writer.write(data)
735
Tim Peters30324a72001-05-15 17:19:16 +0000736 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000737
738 data = ''.join(list)
739 data, bytesdecoded = self.decode(data, self.errors)
740 return self.writer.write(data)
Guido van Rossum0612d842000-03-10 23:20:43 +0000741
742 def reset(self):
743
744 self.reader.reset()
745 self.writer.reset()
746
def __getattr__(self, name,
                getattr=getattr):

    """ Inherit all other methods from the underlying stream.

        Invoked only for attributes not found on the object itself;
        the lookup is forwarded to self.stream. The builtin getattr
        is bound as a default argument so it is available as a
        local name.

    """
    return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000753
754### Shortcuts
755
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        file opened by this call is closed again before the exception
        is propagated.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately bare: the exception is re-raised unchanged, we
        # only make sure the file handle opened above is not leaked.
        stream.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000799
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Two codecs are involved: the caller-facing encode/decode functions
    # come from data_encoding, while the stream reader/writer that talk
    # to the file must come from file_encoding.  Using one lookup for
    # both would silently ignore file_encoding.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
Guido van Rossum0612d842000-03-10 23:20:43 +0000834
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000835### Helpers for codec lookup
836
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000846
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode
856
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        # The codec exists but has no incremental encoder; report this
        # the same way as an unknown encoding.
        raise LookupError(encoding)
    return encoder
870
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        # The codec exists but has no incremental decoder; report this
        # the same way as an unknown encoding.
        raise LookupError(encoding)
    return decoder
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000884
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000894
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter
904
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields the encoded chunks.  Empty results
    are skipped, so the number of yielded items may be smaller than
    the number of input items.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        output = encoder.encode(chunk)
        if output:
            yield output
    # Signal end of input so the encoder can flush any pending state.
    output = encoder.encode("", True)
    if output:
        yield output
922
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an
    IncrementalDecoder and yields the decoded chunks.  Empty results
    are skipped, so the number of yielded items may be smaller than
    the number of input items.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        output = decoder.decode(chunk)
        if output:
            yield output
    # Signal end of input so the decoder can flush any buffered data.
    output = decoder.decode("", True)
    if output:
        yield output
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000940
Marc-André Lemburga866df82001-01-03 21:29:14 +0000941### Helpers for charmap-based codecs
942
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return dict((item, item) for item in rng)
955
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # Ambiguous reverse mapping: mark the target as undefined.
            m[v] = None
    return m
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000976
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000977### error handlers
978
# Bind the standard error handlers to module-level names so callers can
# reference them directly instead of looking them up by name.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000992
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import sits behind a flag that is always false, so it
# is never executed at runtime; it only has to be present in the code.
_false = 0
if _false:
    import encodings
998
Guido van Rossum0612d842000-03-10 23:20:43 +0000999### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001000
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')