blob: 04912a3244f8eb65260a2ee308b092b119868355 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Walter Dörwald6a7ec7c2006-03-18 16:35:17 +000017 raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000018
# Public names exported by "from codecs import *".
# open, EncodedFile and the BOM* constants are defined in this file.
# NOTE(review): register, lookup and the *_errors / *_error helpers are not
# defined in the visible part of this file; presumably they are provided by
# the "from _codecs import *" above -- confirm.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000026
Guido van Rossum0612d842000-03-10 23:20:43 +000027### Constants
28
29#
Walter Dörwald474458d2002-06-04 15:16:29 +000030# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
31# and its possible byte string values
32# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000033#
Guido van Rossum0612d842000-03-10 23:20:43 +000034
Walter Dörwald474458d2002-06-04 15:16:29 +000035# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 marks, one per byte order
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32 marks, one per byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native-endianness variants: anything other than a little-endian
# interpreter gets the big-endian marks.
if sys.byteorder == 'little':
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
else:
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000071
72
73### Codec base classes (defining the API)
74
class CodecInfo(tuple):

    """ Bundle of the callables and factories that make up one codec.

        The instance is a 4-tuple (encode, decode, streamreader,
        streamwriter); every entry, plus the incremental
        encoder/decoder factories and the codec name, is also available
        as an attribute.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only these four entries are part of the tuple itself
        legacy_entries = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, legacy_entries)
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
91
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        Both methods of this base class raise NotImplementedError;
        subclasses have to override them.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            a StreamWriter/StreamReader subclass for codecs which have
            to keep state in order to make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            a StreamWriter/StreamReader subclass for codecs which have
            to keep state in order to make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
155
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        errors selects the error handling scheme and is stored on the
        instance as self.errors. The predefined values are listed in the
        docstring of the Codec class.
        """
        self.errors = errors
        # Starts out empty; this base class never reads it.
        # NOTE(review): presumably scratch space for subclasses -- confirm.
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        The base implementation raises NotImplementedError; subclasses
        have to override it.
        NOTE(review): final presumably marks the last piece of input --
        confirm against the concrete encoders.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """
183
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        errors selects the error handling scheme and is stored on the
        instance as self.errors. The predefined values are listed in the
        docstring of the Codec class.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.

        The base implementation raises NotImplementedError; subclasses
        have to override it.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.

        The base implementation does nothing.
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.

    Input that _buffer_decode() did not consume is kept in self.buffer and
    prepended to the input of the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = "" # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decodes the buffered leftovers followed by input and returns the
        decoded result. Whatever _buffer_decode() reports as not consumed
        is kept for the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Resets the decoder to the initial state, discarding any input
        that is still waiting to be decoded.
        """
        IncrementalDecoder.reset(self)
        # Clear the attribute decode() actually reads; otherwise stale
        # bytes would be prepended to the first decode() after a reset.
        self.buffer = ""
237
Guido van Rossum0612d842000-03-10 23:20:43 +0000238#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000243#
Guido van Rossum0612d842000-03-10 23:20:43 +0000244
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that write()
            hands to self.encode(). These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream, self.errors = stream, errors

    def write(self, object):

        """ Encodes object with self.encode() and writes the encoded
            data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Joins the strings in list and writes the result to the
            stream with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fetches every attribute that is not found on the writer
            from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000304
305###
306
307class StreamReader(Codec):
308
Tim Peters30324a72001-05-15 17:19:16 +0000309 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000310
311 """ Creates a StreamReader instance.
312
313 stream must be a file-like object open for reading
314 (binary) data.
315
Walter Dörwald7f82f792002-11-19 21:42:53 +0000316 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000317 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000318 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000319
320 'strict' - raise a ValueError (or a subclass)
321 'ignore' - ignore the character and continue with the next
322 'replace'- replace with a suitable replacement character;
323
Walter Dörwald7f82f792002-11-19 21:42:53 +0000324 The set of allowed parameter values can be extended via
325 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000326 """
327 self.stream = stream
328 self.errors = errors
Walter Dörwald69652032004-09-07 20:24:22 +0000329 self.bytebuffer = ""
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000330 # For str->str decoding this will stay a str
331 # For str->unicode decoding the first read will promote it to unicode
332 self.charbuffer = ""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000333 self.linebuffer = None
Guido van Rossum0612d842000-03-10 23:20:43 +0000334
Walter Dörwald69652032004-09-07 20:24:22 +0000335 def decode(self, input, errors='strict'):
336 raise NotImplementedError
337
Martin v. Löwis56066d22005-08-24 07:38:12 +0000338 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000339
340 """ Decodes data from the stream self.stream and returns the
341 resulting object.
342
Walter Dörwald69652032004-09-07 20:24:22 +0000343 chars indicates the number of characters to read from the
344 stream. read() will never return more than chars
345 characters, but it might return less, if there are not enough
346 characters available.
347
Guido van Rossum0612d842000-03-10 23:20:43 +0000348 size indicates the approximate maximum number of bytes to
349 read from the stream for decoding purposes. The decoder
350 can modify this setting as appropriate. The default value
351 -1 indicates to read and decode as much as possible. size
352 is intended to prevent having to decode huge files in one
353 step.
354
Martin v. Löwis56066d22005-08-24 07:38:12 +0000355 If firstline is true, and a UnicodeDecodeError happens
356 after the first line terminator in the input only the first line
357 will be returned, the rest of the input will be kept until the
358 next call to read().
359
Guido van Rossum0612d842000-03-10 23:20:43 +0000360 The method should use a greedy read strategy meaning that
361 it should read as much data as is allowed within the
362 definition of the encoding and the given size, e.g. if
363 optional encoding endings or state markers are available
364 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000365 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000366 # If we have lines cached, first merge them back into characters
367 if self.linebuffer:
368 self.charbuffer = "".join(self.linebuffer)
369 self.linebuffer = None
Tim Peters536cf992005-12-25 23:18:31 +0000370
Walter Dörwald69652032004-09-07 20:24:22 +0000371 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000372 while True:
373 # can the request can be satisfied from the character buffer?
374 if chars < 0:
Walter Dörwaldca199432006-03-06 22:39:12 +0000375 if size < 0:
376 if self.charbuffer:
377 break
378 elif len(self.charbuffer) >= size:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000379 break
Guido van Rossum0612d842000-03-10 23:20:43 +0000380 else:
Walter Dörwald69652032004-09-07 20:24:22 +0000381 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000382 break
383 # we need more data
384 if size < 0:
385 newdata = self.stream.read()
386 else:
387 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000388 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000389 data = self.bytebuffer + newdata
Martin v. Löwis56066d22005-08-24 07:38:12 +0000390 try:
391 newchars, decodedbytes = self.decode(data, self.errors)
392 except UnicodeDecodeError, exc:
393 if firstline:
394 newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
395 lines = newchars.splitlines(True)
396 if len(lines)<=1:
397 raise
398 else:
399 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000400 # keep undecoded bytes until the next call
401 self.bytebuffer = data[decodedbytes:]
402 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000403 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000404 # there was no data available
405 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000406 break
407 if chars < 0:
408 # Return everything we've got
409 result = self.charbuffer
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000410 self.charbuffer = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000411 else:
412 # Return the first chars characters
413 result = self.charbuffer[:chars]
414 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000415 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000416
Walter Dörwald69652032004-09-07 20:24:22 +0000417 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000418
419 """ Read one line from the input stream and return the
420 decoded data.
421
Walter Dörwald69652032004-09-07 20:24:22 +0000422 size, if given, is passed as size argument to the
423 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000424
Guido van Rossuma3277132000-04-11 15:37:43 +0000425 """
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000426 # If we have lines cached from an earlier read, return
427 # them unconditionally
428 if self.linebuffer:
429 line = self.linebuffer[0]
430 del self.linebuffer[0]
431 if len(self.linebuffer) == 1:
432 # revert to charbuffer mode; we might need more data
433 # next time
434 self.charbuffer = self.linebuffer[0]
435 self.linebuffer = None
436 if not keepends:
437 line = line.splitlines(False)[0]
438 return line
Tim Peters536cf992005-12-25 23:18:31 +0000439
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000440 readsize = size or 72
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000441 line = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000442 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000443 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000444 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000445 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000446 # If we're at a "\r" read one extra character (which might
447 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000448 # temporarily exhausted we return the wrong line ending.
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000449 if data.endswith("\r"):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000450 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000451
Walter Dörwald69652032004-09-07 20:24:22 +0000452 line += data
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000453 lines = line.splitlines(True)
454 if lines:
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000455 if len(lines) > 1:
456 # More than one line result; the first line is a full line
457 # to return
458 line = lines[0]
459 del lines[0]
460 if len(lines) > 1:
461 # cache the remaining lines
462 lines[-1] += self.charbuffer
463 self.linebuffer = lines
464 self.charbuffer = None
465 else:
466 # only one remaining line, put it back into charbuffer
467 self.charbuffer = lines[0] + self.charbuffer
468 if not keepends:
469 line = line.splitlines(False)[0]
470 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000471 line0withend = lines[0]
472 line0withoutend = lines[0].splitlines(False)[0]
473 if line0withend != line0withoutend: # We really have a line end
474 # Put the rest back together and keep it until the next call
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000475 self.charbuffer = "".join(lines[1:]) + self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000476 if keepends:
477 line = line0withend
478 else:
479 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000480 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000481 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000482 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000483 if line and not keepends:
484 line = line.splitlines(False)[0]
485 break
486 if readsize<8000:
487 readsize *= 2
488 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000489
Walter Dörwald69652032004-09-07 20:24:22 +0000490 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000491
492 """ Read all lines available on the input stream
493 and return them as list of lines.
494
495 Line breaks are implemented using the codec's decoder
496 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000497
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000498 sizehint, if given, is ignored since there is no efficient
499 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000500
501 """
Walter Dörwald69652032004-09-07 20:24:22 +0000502 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000503 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000504
505 def reset(self):
506
507 """ Resets the codec buffers used for keeping state.
508
509 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000510 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000511 from decoding errors.
512
513 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000514 self.bytebuffer = ""
515 self.charbuffer = u""
Martin v. Löwis4ed67382005-09-18 08:34:39 +0000516 self.linebuffer = None
Walter Dörwald729c31f2005-03-14 19:06:30 +0000517
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000518 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000519 """ Set the input stream's current position.
520
521 Resets the codec buffers used for keeping state.
522 """
523 self.reset()
524 self.stream.seek(offset, whence)
Guido van Rossum0612d842000-03-10 23:20:43 +0000525
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000526 def next(self):
527
528 """ Return the next decoded line from the input stream."""
529 line = self.readline()
530 if line:
531 return line
532 raise StopIteration
533
534 def __iter__(self):
535 return self
536
Tim Peters30324a72001-05-15 17:19:16 +0000537 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000538 getattr=getattr):
539
540 """ Inherit all other methods from the underlying stream.
541 """
Tim Peters30324a72001-05-15 17:19:16 +0000542 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000543
544###
545
546class StreamReaderWriter:
547
Fred Drake49fd1072000-04-13 14:11:21 +0000548 """ StreamReaderWriter instances allow wrapping streams which
549 work in both read and write modes.
550
551 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000552 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000553 instance.
554
555 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000556 # Optional attributes set by the file wrappers below
557 encoding = 'unknown'
558
Tim Peters30324a72001-05-15 17:19:16 +0000559 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000560
561 """ Creates a StreamReaderWriter instance.
562
563 stream must be a Stream-like object.
564
565 Reader, Writer must be factory functions or classes
566 providing the StreamReader, StreamWriter interface resp.
567
568 Error handling is done in the same way as defined for the
569 StreamWriter/Readers.
570
571 """
572 self.stream = stream
573 self.reader = Reader(stream, errors)
574 self.writer = Writer(stream, errors)
575 self.errors = errors
576
Tim Peters30324a72001-05-15 17:19:16 +0000577 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000578
579 return self.reader.read(size)
580
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000581 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000582
583 return self.reader.readline(size)
584
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000585 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000586
587 return self.reader.readlines(sizehint)
588
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000589 def next(self):
590
591 """ Return the next decoded line from the input stream."""
592 return self.reader.next()
593
594 def __iter__(self):
595 return self
596
Tim Peters30324a72001-05-15 17:19:16 +0000597 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000598
599 return self.writer.write(data)
600
Tim Peters30324a72001-05-15 17:19:16 +0000601 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000602
603 return self.writer.writelines(list)
604
Guido van Rossum0612d842000-03-10 23:20:43 +0000605 def reset(self):
606
607 self.reader.reset()
608 self.writer.reset()
609
Tim Peters30324a72001-05-15 17:19:16 +0000610 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000611 getattr=getattr):
612
613 """ Inherit all other methods from the underlying stream.
614 """
Tim Peters30324a72001-05-15 17:19:16 +0000615 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000616
617###
618
619class StreamRecoder:
620
Fred Drake49fd1072000-04-13 14:11:21 +0000621 """ StreamRecoder instances provide a frontend - backend
622 view of encoding data.
623
624 They use the complete set of APIs returned by the
625 codecs.lookup() function to implement their task.
626
627 Data written to the stream is first decoded into an
628 intermediate format (which is dependent on the given codec
629 combination) and then written to the stream using an instance
630 of the provided Writer class.
631
632 In the other direction, data is read from the stream using a
633 Reader instance and then return encoded data to the caller.
634
635 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000636 # Optional attributes set by the file wrappers below
637 data_encoding = 'unknown'
638 file_encoding = 'unknown'
639
Tim Peters30324a72001-05-15 17:19:16 +0000640 def __init__(self, stream, encode, decode, Reader, Writer,
641 errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000642
643 """ Creates a StreamRecoder instance which implements a two-way
644 conversion: encode and decode work on the frontend (the
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000645 input to .read() and output of .write()) while
Guido van Rossum0612d842000-03-10 23:20:43 +0000646 Reader and Writer work on the backend (reading and
Fred Drake908670c2000-03-17 15:42:11 +0000647 writing to the stream).
Guido van Rossum0612d842000-03-10 23:20:43 +0000648
649 You can use these objects to do transparent direct
650 recodings from e.g. latin-1 to utf-8 and back.
651
652 stream must be a file-like object.
653
654 encode, decode must adhere to the Codec interface, Reader,
655 Writer must be factory functions or classes providing the
656 StreamReader, StreamWriter interface resp.
657
658 encode and decode are needed for the frontend translation,
659 Reader and Writer for the backend translation. Unicode is
660 used as intermediate encoding.
661
662 Error handling is done in the same way as defined for the
663 StreamWriter/Readers.
664
665 """
666 self.stream = stream
667 self.encode = encode
668 self.decode = decode
669 self.reader = Reader(stream, errors)
670 self.writer = Writer(stream, errors)
671 self.errors = errors
672
Tim Peters30324a72001-05-15 17:19:16 +0000673 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000674
675 data = self.reader.read(size)
676 data, bytesencoded = self.encode(data, self.errors)
677 return data
678
Tim Peters30324a72001-05-15 17:19:16 +0000679 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000680
681 if size is None:
682 data = self.reader.readline()
683 else:
684 data = self.reader.readline(size)
685 data, bytesencoded = self.encode(data, self.errors)
686 return data
687
Tim Peters30324a72001-05-15 17:19:16 +0000688 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000689
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000690 data = self.reader.read()
Guido van Rossuma3277132000-04-11 15:37:43 +0000691 data, bytesencoded = self.encode(data, self.errors)
692 return data.splitlines(1)
693
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000694 def next(self):
695
696 """ Return the next decoded line from the input stream."""
Walter Dörwaldc5238b82005-09-01 11:56:53 +0000697 data = self.reader.next()
698 data, bytesencoded = self.encode(data, self.errors)
699 return data
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000700
701 def __iter__(self):
702 return self
703
Tim Peters30324a72001-05-15 17:19:16 +0000704 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000705
706 data, bytesdecoded = self.decode(data, self.errors)
707 return self.writer.write(data)
708
Tim Peters30324a72001-05-15 17:19:16 +0000709 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000710
711 data = ''.join(list)
712 data, bytesdecoded = self.decode(data, self.errors)
713 return self.writer.write(data)
Guido van Rossum0612d842000-03-10 23:20:43 +0000714
715 def reset(self):
716
717 self.reader.reset()
718 self.writer.reset()
719
Tim Peters30324a72001-05-15 17:19:16 +0000720 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000721 getattr=getattr):
722
723 """ Inherit all other methods from the underlying stream.
724 """
Tim Peters30324a72001-05-15 17:19:16 +0000725 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000726
727### Shortcuts
728
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        file is closed again before the exception is re-raised.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(stream, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything: the handle must not leak,
        # whatever went wrong, and the error is passed on unchanged.
        stream.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000772
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are decoded according
        to the given data_encoding and then written to the original
        file encoded using file_encoding. The intermediate format
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file and decoded using
        file_encoding and then passed back to the caller encoded
        using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

        Raises a LookupError in case one of the encodings cannot be
        found.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Two separate codecs are involved: the stateless encode/decode
    # functions translate the caller's data (data_encoding), while the
    # stream reader/writer talk to the underlying file (file_encoding).
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
Guido van Rossum0612d842000-03-10 23:20:43 +0000807
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000808### Helpers for codec lookup
809
def getencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000819
def getdecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
829
def getincrementalencoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder registered.
    raise LookupError(encoding)
843
def getincrementaldecoder(encoding):

    """ Look up the codec registered for the given encoding and
        return its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder registered.
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000857
def getreader(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000867
def getwriter(encoding):

    """ Look up the codec registered for the given encoding and
        return its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
877
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string produced by iterator to an IncrementalEncoder
    for the given encoding and yields each non-empty piece of output.
    After the input is exhausted the encoder is called once more with
    an empty string and final=True, and that output is yielded as well
    if it is non-empty.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    incremental = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = incremental.encode(chunk)
        if encoded:
            yield encoded
    # Final call: lets the encoder emit whatever it still has pending.
    tail = incremental.encode("", True)
    if tail:
        yield tail
895
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string produced by iterator to an IncrementalDecoder
    for the given encoding and yields each non-empty piece of output.
    After the input is exhausted the decoder is called once more with
    an empty string and final=True, and that output is yielded as well
    if it is non-empty.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    incremental = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = incremental.decode(chunk)
        if decoded:
            yield decoded
    # Final call: lets the decoder emit whatever it still has pending.
    tail = incremental.decode("", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000913
Marc-André Lemburga866df82001-01-03 21:29:14 +0000914### Helpers for charmap-based codecs
915
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence appears as a key mapped to itself.

    """
    return dict((item, item) for item in rng)
928
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        Every value of decoding_map becomes a key of the result,
        mapped to the key it was stored under.  If a target occurs
        more than once in the decoding map, it is mapped to None
        (undefined mapping) instead, causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000949
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000950### error handlers
951
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # Builds configured with --disable-unicode do not have these error
    # handlers; bind every name to None so that it is still defined.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None

# Hint for modulefinder: code using codecs most likely needs the
# encodings package as well.  The import below is never executed.
_false = 0
if _false:
    import encodings
971
Guido van Rossum0612d842000-03-10 23:20:43 +0000972### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000973
Guido van Rossum0612d842000-03-10 23:20:43 +0000974if __name__ == '__main__':
975
Guido van Rossuma3277132000-04-11 15:37:43 +0000976 # Make stdout translate Latin-1 output into UTF-8 output
977 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000978
Guido van Rossuma3277132000-04-11 15:37:43 +0000979 # Have stdin translate Latin-1 input into UTF-8 input
980 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')