blob: e4e14cfec01044ecfcd0d3640eddedad538df673 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Guido van Rossumb940e112007-01-10 16:19:56 +000016except ImportError as why:
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
Tim Peters30324a72001-05-15 17:19:16 +000019__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
Walter Dörwald474458d2002-06-04 15:16:29 +000020 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
21 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
Walter Dörwald3aeb6322002-09-02 13:14:32 +000022 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
23 "strict_errors", "ignore_errors", "replace_errors",
24 "xmlcharrefreplace_errors",
25 "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
Guido van Rossum0612d842000-03-10 23:20:43 +000027### Constants
28
29#
Walter Dörwald474458d2002-06-04 15:16:29 +000030# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000033#
Guido van Rossum0612d842000-03-10 23:20:43 +000034
Walter Dörwald474458d2002-06-04 15:16:29 +000035# UTF-8
# UTF-8 encoded byte order mark
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 byte order marks (BOM_LE / BOM_BE are the short spellings)
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 byte order marks
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness aliases, picked once from the platform byte order
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the 32/64 in
# their names they refer to the UTF-16 and UTF-32 marks respectively
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Describes the entry points of one codec.

        The instance is the 4-tuple (encode, decode, streamreader,
        streamwriter); the same four objects plus the incremental
        encoder/decoder factories and the codec name are also
        available as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entry points make up the tuple value
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
91
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        Both methods of this base class raise NotImplementedError;
        concrete codecs are expected to override them.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
155
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors
        # starts out empty; this base class itself never reads it
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        This base implementation does nothing.
        """
183
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that cannot always encode all of
    the input they are given and therefore have to carry the unencoded
    remainder over to the next call to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # unencoded input carried over between calls to encode()
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        outcome = self._buffer_encode(pending, self.errors, final)
        (encoded, used) = outcome
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return encoded

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""
210
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the docstring of the
        Codec class for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.

        This base implementation does nothing.
        """
237
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with incomplete
    byte sequences: input that could not be decoded yet is carried over
    to the next call to decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # undecoded input carried over between calls to decode()
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # prepend whatever was left over from the previous call
        pending = self.buffer + input
        outcome = self._buffer_decode(pending, self.errors, final)
        (decoded, used) = outcome
        # whatever was not consumed waits for the next call
        self.buffer = pending[used:]
        return decoded

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""
Thomas Woutersa9773292006-04-21 09:43:23 +0000264
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000271
class StreamWriter(Codec):

    """ Wraps a stream and encodes everything written to it.

        Subclasses supply the actual .encode() method; every
        attribute not defined here is looked up on the wrapped
        stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        # the consumed length reported by the codec is not needed here
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream attribute itself is
            missing.
        """
        # __getattr__ only runs when normal lookup failed. If 'stream'
        # itself is missing (instance created without __init__),
        # reading self.stream below would re-enter this method forever.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
337
Guido van Rossum0612d842000-03-10 23:20:43 +0000338###
339
class StreamReader(Codec):

    """ Wraps a stream and decodes everything read from it.

        Subclasses supply the actual .decode() method; every
        attribute not defined here is looked up on the wrapped
        stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # input read from the stream that could not be decoded yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of already split lines (set by readline()), or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # must be provided by the codec specific subclass
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError as exc:
                if firstline:
                    # retry with only the input before the offending
                    # position; the error is tolerated only if that
                    # yields more than one line
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read size for the next attempt (up to a limit)
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # same empty value as in __init__, so that a str->str codec is
        # not promoted to unicode after a reset()
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the stream attribute itself is
            missing.
        """
        # __getattr__ only runs when normal lookup failed. If 'stream'
        # itself is missing (instance created without __init__),
        # reading self.stream below would re-enter this method forever.
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with-block closes the underlying stream
        self.stream.close()
582
Guido van Rossum0612d842000-03-10 23:20:43 +0000583###
584
585class StreamReaderWriter:
586
Fred Drake49fd1072000-04-13 14:11:21 +0000587 """ StreamReaderWriter instances allow wrapping streams which
588 work in both read and write modes.
589
590 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000591 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000592 instance.
593
594 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000595 # Optional attributes set by the file wrappers below
596 encoding = 'unknown'
597
Tim Peters30324a72001-05-15 17:19:16 +0000598 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000599
600 """ Creates a StreamReaderWriter instance.
601
602 stream must be a Stream-like object.
603
604 Reader, Writer must be factory functions or classes
605 providing the StreamReader, StreamWriter interface resp.
606
607 Error handling is done in the same way as defined for the
608 StreamWriter/Readers.
609
610 """
611 self.stream = stream
612 self.reader = Reader(stream, errors)
613 self.writer = Writer(stream, errors)
614 self.errors = errors
615
Tim Peters30324a72001-05-15 17:19:16 +0000616 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000617
618 return self.reader.read(size)
619
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000620 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000621
622 return self.reader.readline(size)
623
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000624 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000625
626 return self.reader.readlines(sizehint)
627
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000628 def next(self):
629
630 """ Return the next decoded line from the input stream."""
631 return self.reader.next()
632
633 def __iter__(self):
634 return self
635
Tim Peters30324a72001-05-15 17:19:16 +0000636 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000637
638 return self.writer.write(data)
639
Tim Peters30324a72001-05-15 17:19:16 +0000640 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000641
642 return self.writer.writelines(list)
643
Guido van Rossum0612d842000-03-10 23:20:43 +0000644 def reset(self):
645
646 self.reader.reset()
647 self.writer.reset()
648
Tim Peters30324a72001-05-15 17:19:16 +0000649 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000650 getattr=getattr):
651
652 """ Inherit all other methods from the underlying stream.
653 """
Tim Peters30324a72001-05-15 17:19:16 +0000654 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000655
Thomas Wouters89f507f2006-12-13 04:49:30 +0000656 # these are needed to make "with codecs.open(...)" work properly
657
658 def __enter__(self):
659 return self
660
661 def __exit__(self, type, value, tb):
662 self.stream.close()
663
Guido van Rossum0612d842000-03-10 23:20:43 +0000664###
665
666class StreamRecoder:
667
Fred Drake49fd1072000-04-13 14:11:21 +0000668 """ StreamRecoder instances provide a frontend - backend
669 view of encoding data.
670
671 They use the complete set of APIs returned by the
672 codecs.lookup() function to implement their task.
673
674 Data written to the stream is first decoded into an
675 intermediate format (which is dependent on the given codec
676 combination) and then written to the stream using an instance
677 of the provided Writer class.
678
679 In the other direction, data is read from the stream using a
680 Reader instance and then return encoded data to the caller.
681
682 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000683 # Optional attributes set by the file wrappers below
684 data_encoding = 'unknown'
685 file_encoding = 'unknown'
686
Tim Peters30324a72001-05-15 17:19:16 +0000687 def __init__(self, stream, encode, decode, Reader, Writer,
688 errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000689
690 """ Creates a StreamRecoder instance which implements a two-way
691 conversion: encode and decode work on the frontend (the
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000692 input to .read() and output of .write()) while
Guido van Rossum0612d842000-03-10 23:20:43 +0000693 Reader and Writer work on the backend (reading and
Fred Drake908670c2000-03-17 15:42:11 +0000694 writing to the stream).
Guido van Rossum0612d842000-03-10 23:20:43 +0000695
696 You can use these objects to do transparent direct
697 recodings from e.g. latin-1 to utf-8 and back.
698
699 stream must be a file-like object.
700
701 encode, decode must adhere to the Codec interface, Reader,
702 Writer must be factory functions or classes providing the
703 StreamReader, StreamWriter interface resp.
704
705 encode and decode are needed for the frontend translation,
706 Reader and Writer for the backend translation. Unicode is
707 used as intermediate encoding.
708
709 Error handling is done in the same way as defined for the
710 StreamWriter/Readers.
711
712 """
713 self.stream = stream
714 self.encode = encode
715 self.decode = decode
716 self.reader = Reader(stream, errors)
717 self.writer = Writer(stream, errors)
718 self.errors = errors
719
Tim Peters30324a72001-05-15 17:19:16 +0000720 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000721
722 data = self.reader.read(size)
723 data, bytesencoded = self.encode(data, self.errors)
724 return data
725
Tim Peters30324a72001-05-15 17:19:16 +0000726 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000727
728 if size is None:
729 data = self.reader.readline()
730 else:
731 data = self.reader.readline(size)
732 data, bytesencoded = self.encode(data, self.errors)
733 return data
734
Tim Peters30324a72001-05-15 17:19:16 +0000735 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000736
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000737 data = self.reader.read()
Guido van Rossuma3277132000-04-11 15:37:43 +0000738 data, bytesencoded = self.encode(data, self.errors)
739 return data.splitlines(1)
740
def next(self):

    """ Return the next item produced by the backend reader,
        converted by the frontend encode function.

        Whatever self.reader.next() raises propagates unchanged.

    """
    item = self.reader.next()
    recoded, _consumed = self.encode(item, self.errors)
    return recoded
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000747
def __iter__(self):

    """ Return the object itself as its own iterator. """
    return self
750
def write(self, data):

    """ Convert data with the frontend decode function and write
        the result through the backend writer.

        Returns whatever self.writer.write() returns.

    """
    # decode() returns an (output, length consumed) pair
    decoded, _consumed = self.decode(data, self.errors)
    return self.writer.write(decoded)
755
def writelines(self, list):

    """ Join the given strings, convert the result with the
        frontend decode function and write it through the backend
        writer in a single write() call.

        Returns whatever self.writer.write() returns.

    """
    joined = ''.join(list)
    decoded, _consumed = self.decode(joined, self.errors)
    return self.writer.write(decoded)
Guido van Rossum0612d842000-03-10 23:20:43 +0000761
def reset(self):

    """ Reset the backend reader first, then the backend writer. """
    reader = self.reader
    reader.reset()
    writer = self.writer
    writer.reset()
766
def __getattr__(self, name,
                getattr=getattr):

    """ Fetch any attribute not found on this object from the
        underlying stream instead.

        The builtin getattr is bound as a default argument and used
        for the lookup on self.stream.

    """
    stream = self.stream
    return getattr(stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000773
def __enter__(self):

    """ Enter a with-block; the object itself is the target. """
    return self
776
def __exit__(self, type, value, tb):

    """ Leave a with-block by closing the underlying stream.

        The exception details are ignored and nothing is returned,
        so an exception raised inside the block is not suppressed.

    """
    stream = self.stream
    stream.close()
779
Guido van Rossum0612d842000-03-10 23:20:43 +0000780### Shortcuts
781
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back without any wrapping.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails (e.g.
        LookupError for an unknown encoding), the already opened file
        is closed before the exception is propagated.

    """
    if encoding is not None and 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader,
                                 info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except BaseException:
        # Do not leak the open file handle when the codec machinery
        # fails; close it and let the original exception continue.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000825
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that data passing through it is transparently
        translated between two encodings.

        Strings written to the wrapper are interpreted according to
        data_encoding and then written to the original file as
        strings using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings read from the file are interpreted using
        file_encoding and handed back to the caller as strings using
        data_encoding.

        file_encoding defaults to data_encoding when not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the values used
        for the parameters of the same name so that Python programs
        can inspect them.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend codec: what the caller sees; backend codec: what the
    # wrapped file contains.
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        frontend.encode, frontend.decode,
        backend.streamreader, backend.streamwriter,
        errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000861
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000862### Helpers for codec lookup
863
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000873
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
883
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but registered no incremental encoder
    raise LookupError(encoding)
897
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but registered no incremental decoder
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000911
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000921
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
931
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds each string from iterator to an IncrementalEncoder for the
    given encoding and yields every non-empty result. After the input
    is exhausted the encoder is called once more with final=True and
    any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Final call lets the encoder emit anything it still holds back
    tail = encoder.encode("", True)
    if tail:
        yield tail
949
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds each string from iterator to an IncrementalDecoder for the
    given encoding and yields every non-empty result. After the input
    is exhausted the decoder is called once more with final=True and
    any remaining output is yielded as well.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Final call lets the decoder emit anything it still holds back
    tail = decoder.decode("", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000967
Marc-André Lemburga866df82001-01-03 21:29:14 +0000968### Helpers for charmap-based codecs
969
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a new dictionary in which every element
        of the rng sequence is both a key and its own value.

    """
    return dict((item, item) for item in rng)
982
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs only once in the decoding map is mapped
        back to its source key. A target that occurs multiple times
        is mapped to None (undefined mapping), causing an exception
        when encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +00001003
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001004### error handlers
1005
# Publish the builtin error handling callbacks under module level
# names; if any of them cannot be looked up, all five names are
# bound to None instead.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001019
# Hint for modulefinder: programs using codecs probably need the
# encodings package as well. The import below is never executed
# because _false is 0; it only has to be present in the source.
_false = 0
if _false:
    import encodings
1025
Guido van Rossum0612d842000-03-10 23:20:43 +00001026### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +00001027
if __name__ == '__main__':

    # Wrap stdout: Latin-1 data written by the program reaches the
    # real stdout recoded as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Wrap stdin: Latin-1 data arriving on the real stdin is handed
    # to the program recoded as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')