blob: 28856c714e9d2b94ce3d6268c48822892bb0f606 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Select the "native" marks according to the byte order reported by
# sys.byteorder for the running interpreter.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code).
# Despite the "32" and "64" in their names, these are plain aliases for
# the UTF-16 and UTF-32 marks defined above.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class CodecInfo(tuple):

    """ Bundles the entry points of a single codec.

        The object is a 4-tuple (encode, decode, streamreader,
        streamwriter); the same four values plus incrementalencoder,
        incrementaldecoder and name are also available as attributes.

    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only these four entries make up the tuple contents; everything
        # else is reachable by attribute access only.
        tuple_items = (encode, decode, streamreader, streamwriter)
        info = tuple.__new__(cls, tuple_items)
        info.encode = encode
        info.decode = decode
        info.streamreader = streamreader
        info.streamwriter = streamwriter
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.name = name
        return info

    def __repr__(self):
        klass = self.__class__
        details = (klass.__module__, klass.__name__, self.name, id(self))
        return "<%s.%s object for encoding %s at 0x%x>" % details
92
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are predefined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            encode efficiently should use StreamWriter instead.

            Zero length input must be supported; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            decode efficiently should use StreamReader instead.

            Zero length input must be supported; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
156
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in several steps. The input is
    handed piece by piece to the encode() method, and the encoder keeps the
    state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        errors selects the error handling scheme to use; see the docstring
        of the Codec class for the predefined values.
        """
        # The buffer starts out empty; nothing in this base class reads it.
        self.buffer = ""
        self.errors = errors

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Resets the encoder to the initial state.

        The base implementation does nothing.
        """
        pass
184
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in several steps. The input is
    handed piece by piece to the decode() method, and the decoder keeps the
    state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        errors selects the error handling scheme to use; see the docstring
        of the Codec class for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Resets the decoder to the initial state.

        The base implementation does nothing.
        """
        pass
211
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.

    Input that _buffer_decode() reports as not consumed is kept in
    self.buffer and prepended to the input of the next decode() call.
    """
    def __init__(self, errors='strict'):
        """
        Creates a BufferedIncrementalDecoder instance with an empty buffer.

        errors is passed on to IncrementalDecoder.__init__().
        """
        IncrementalDecoder.__init__(self, errors)
        self.buffer = "" # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        """
        Decodes the buffered leftovers plus input and returns the result.

        Whatever _buffer_decode() did not consume is kept for the next call.
        """
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        """
        Resets the decoder to the initial state, dropping buffered input.
        """
        IncrementalDecoder.reset(self)
        # Clear the attribute decode() actually reads. Clearing any other
        # name would leave stale input that gets prepended to the next
        # decode() call.
        self.buffer = ""
238
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000245
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encodes object with self.encode() and writes the result
            to self.stream.
        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Joins the given list of strings and writes the result to
            the stream using .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            The base implementation does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000305
306###
307
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Data read from the stream that has not been decoded yet
        self.bytebuffer = ""
        # Decoded data that has not been returned to the caller yet.
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # List of already split lines cached by readline(), or None
        # while the pending data lives in charbuffer
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be provided by subclasses; read() expects it to return a
        # (decoded data, number of bytes consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the
            data before the error position is decoded and used; the
            rest of the input will be kept until the next call to
            read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Retry with only the bytes in front of the error
                    # position; the error is re-raised unless that part
                    # splits into more than one line.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line ending is stripped from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # Without an explicit size start with 72 bytes; the value is
        # doubled after each unsuccessful pass of the loop below until
        # it reaches 8000 or more.
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries if keepends
            is true.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        # The reader is its own iterator; see next()
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000544
545###
546
class StreamReaderWriter:

    """ StreamReaderWriter instances wrap streams which are used
        for both reading and writing.

        Reading is delegated to a reader object and writing to a
        writer object, both created on the same stream. The factory
        functions returned by the codec.lookup() function can be
        used to construct the instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        # The reader is created before the writer
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Delegates to the reader's read() method."""
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Delegates to the reader's readline() method."""
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the reader's readlines() method."""
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegates to the writer's write() method."""
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Delegates to the writer's writelines() method."""
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Resets the reader first and then the writer."""
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000617
618###
619
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first passed through the
        decode function, giving an intermediate format (which
        depends on the given codec combination), and the result is
        then written to the stream using an instance of the provided
        Writer class.

        In the other direction, data is read from the stream using a
        Reader instance, passed through the encode function and then
        returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        self.encode = encode
        self.decode = decode
        # The reader is created before the writer
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Reads from the backend reader and returns the data
            passed through the frontend encode function.
        """
        raw = self.reader.read(size)
        encoded, consumed = self.encode(raw, self.errors)
        return encoded

    def readline(self, size=None):

        """ Reads one line from the backend reader and returns it
            passed through the frontend encode function.

            size is only handed to the reader if it is not None.
        """
        if size is not None:
            raw = self.reader.readline(size)
        else:
            raw = self.reader.readline()
        encoded, consumed = self.encode(raw, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Reads everything from the backend reader, encodes it and
            returns the result split into lines (line ends are kept).

            sizehint is ignored.
        """
        raw = self.reader.read()
        encoded, consumed = self.encode(raw, self.errors)
        return encoded.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, passed
            through the frontend encode function."""
        raw = self.reader.next()
        encoded, consumed = self.encode(raw, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):

        """ Passes data through the frontend decode function and
            writes the result using the backend writer.
        """
        decoded, consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Joins the given list of strings, passes the result
            through the frontend decode function and writes it using
            the backend writer.
        """
        joined = ''.join(list)
        decoded, consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Resets the reader first and then the writer."""
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000727
728### Shortcuts
729
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or creating the wrapper fails, the
        already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # The file is already open at this point; don't leak it when the
        # codec lookup or the wrapper construction fails. The original
        # exception is re-raised unchanged.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000773
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

        Raises a LookupError in case one of the encodings cannot be
        found.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The caller-facing side uses the stateless encode/decode functions
    # of data_encoding; the file-facing side must use the stream
    # reader/writer of file_encoding, otherwise no translation between
    # the two encodings takes place.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
Guido van Rossum0612d842000-03-10 23:20:43 +0000808
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000809### Helpers for codec lookup
810
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000820
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
830
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder.
    raise LookupError(encoding)
844
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder.
    raise LookupError(encoding)
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000858
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000868
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
878
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an
    IncrementalEncoder and yields every non-empty piece of output.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if not encoded:
            # Nothing produced for this chunk; don't yield empty output.
            continue
        yield encoded
    # A final call with an empty string signals the end of the input.
    tail = encoder.encode("", True)
    if tail:
        yield tail
896
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an
    IncrementalDecoder and yields every non-empty piece of output.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if not decoded:
            # Nothing produced for this chunk; don't yield empty output.
            continue
        yield decoded
    # A final call with an empty string signals the end of the input.
    tail = decoder.decode("", True)
    if tail:
        yield tail
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000914
Marc-André Lemburga866df82001-01-03 21:29:14 +0000915### Helpers for charmap-based codecs
916
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is used as a key that maps to itself.

    """
    return dict([(item, item) for item in rng])
929
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Ambiguous reverse mapping: mark the target as undefined.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000950
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951### error handlers
952
# Publish the standard error handlers as module-level names.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # every name is bound to None in that case.
    strict_errors = ignore_errors = replace_errors = \
        xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000966
# Hint for modulefinder: code using codecs most likely needs the
# encodings package as well.  The branch below is never taken at
# runtime (_false is 0); it only exists so that the import statement
# appears in the module's source.
_false = 0
if _false:
    import encodings
972
Guido van Rossum0612d842000-03-10 23:20:43 +0000973### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000974
Guido van Rossum0612d842000-03-10 23:20:43 +0000975if __name__ == '__main__':
976
Guido van Rossuma3277132000-04-11 15:37:43 +0000977 # Make stdout translate Latin-1 output into UTF-8 output
978 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000979
Guido van Rossuma3277132000-04-11 15:37:43 +0000980 # Have stdin translate Latin-1 input into UTF-8 input
981 sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')