blob: 6895a228fed75a7be16a9be7dab8c925a151a61b [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  Every error handler that is
# bound at module level further down (including backslashreplace_errors)
# is listed so that star-imports see the complete set.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
Walter Dörwald474458d2002-06-04 15:16:29 +000036# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 marks, one per byte order
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32 marks, one per byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native-endian variants, selected from the byte order of this machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names alias the
# UTF-16 marks and the "64" names alias the UTF-32 marks.
BOM32_LE, BOM32_BE = BOM_UTF16_LE, BOM_UTF16_BE
BOM64_LE, BOM64_BE = BOM_UTF32_LE, BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec
            instance.  Codecs that have to keep state in order to
            encode efficiently should use a StreamWriter instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            The base class implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec
            instance.  Codecs that have to keep state in order to
            decode efficiently should use a StreamReader instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            The base class implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used by
            .write().  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() using self.errors and
            write the encoded data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write()
            in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation keeps no state and does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up any attribute not found on the writer itself on
            the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Input bytes that decode() did not consume on the previous call
        self.bytebuffer = ""
        # Decoded data that has not been handed to the caller yet.
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # List of decoded lines cached by readline(); while it is set,
        # charbuffer is None.  None means "charbuffer mode".
        self.linebuffer = None

    def decode(self, input, errors='strict'):

        """ Decode input and return (output object, length consumed).

            Must be overridden by the concrete codec; this default
            always raises NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError:
                # The exception object is fetched via sys.exc_info() so
                # that the handler does not depend on version specific
                # "except ... , name" syntax.
                exc = sys.exc_info()[1]
                if firstline:
                    # Retry with only the bytes in front of the error
                    # position; the result is only usable if it holds
                    # more than one line, otherwise the error is
                    # propagated.
                    newchars, decodedbytes = self.decode(data[:exc.start],
                                                         self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line terminator is stripped
            from the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the chunk size for the next attempt, up to a limit
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Same initial value as in __init__: a plain str, so that
        # str->str codecs are not promoted to unicode after a reset.
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        # Reposition first: if the stream refuses to seek, the buffered
        # data still matches the (unchanged) stream position.
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            result.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        # The reader is its own iterator; see next().
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000445
446###
447
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return self.reader.read(size). """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return self.reader.readline(size). """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return self.reader.readlines(sizehint). """
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        # The wrapper is its own iterator; see next().
        return self

    def write(self, data):

        """ Return self.writer.write(data). """
        return self.writer.write(data)

    def writelines(self, list):

        """ Return self.writer.writelines(list). """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer. """
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):

        """ Set the underlying stream's position and drop codec state.

            Without this method the call would fall through
            __getattr__ to the raw stream and leave previously
            buffered data in the reader.  The reader is always
            reset; the writer is reset only when seeking to the very
            start of the stream (whence == 0 and offset == 0).

            Returns whatever the stream's own seek() returns.
        """
        result = self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()
        return result

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000518
519###
520
class StreamRecoder:

    """ StreamRecoder instances present a frontend/backend view of
        encoded data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written through the recoder is first decoded into an
        intermediate format (which depends on the given codec
        combination) and then written to the stream by an instance
        of the provided Writer class.

        In the other direction, data is read from the stream by a
        Reader instance and handed to the caller after encoding it.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # frontend translation functions
        self.encode = encode
        self.decode = decode
        # backend stream wrappers, both bound to the same stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            for the frontend.
        """
        decoded = self.reader.read(size)
        encoded, _count = self.encode(decoded, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            encoded for the frontend.  size is only passed on to the
            reader when it is not None.
        """
        if size is not None:
            decoded = self.reader.readline(size)
        else:
            decoded = self.reader.readline()
        encoded, _count = self.encode(decoded, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it for the
            frontend and return it split into lines (line ends kept).
            sizehint is not used.
        """
        decoded = self.reader.read()
        encoded, _count = self.encode(decoded, self.errors)
        return encoded.splitlines(1)

    def next(self):

        """ Return the next line from the backend reader, encoded
            for the frontend.
        """
        decoded = self.reader.next()
        encoded, _count = self.encode(decoded, self.errors)
        return encoded

    def __iter__(self):
        # The recoder is its own iterator; see next().
        return self

    def write(self, data):

        """ Decode frontend data and pass the result to the backend
            writer; returns the writer's result.
        """
        decoded, _count = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the strings in list, decode the result and pass it
            to the backend writer in one .write() call.
        """
        joined = ''.join(list)
        decoded, _count = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset both the backend reader and the backend writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up any attribute not found on the recoder itself on
            the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000628
629### Shortcuts
630
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        already opened file is closed before the exception is
        propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Don't leave the file open when the codec is unknown or the
        # stream wrappers cannot be created; re-raise unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000674
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings transparently.

        Strings written to the wrapped file are interpreted according
        to data_encoding and then written to the original file as
        strings using file_encoding.  The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as strings using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name (file_encoding after applying
        its default). The attributes can be used for introspection
        by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend: stateless encode/decode functions of the data encoding
    frontend_codec = lookup(data_encoding)
    encode, decode = frontend_codec[:2]
    # backend: stream reader/writer factories of the file encoding
    backend_codec = lookup(file_encoding)
    Reader, Writer = backend_codec[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000711
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000712### Helpers for codec lookup
713
def getencoder(encoding):

    """ Look up the codec registered for encoding and return its
        encoder function (item 0 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
723
def getdecoder(encoding):

    """ Look up the codec registered for encoding and return its
        decoder function (item 1 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
733
def getreader(encoding):

    """ Look up the codec registered for encoding and return its
        StreamReader class or factory function (item 2 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
743
def getwriter(encoding):

    """ Look up the codec registered for encoding and return its
        StreamWriter class or factory function (item 3 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
753
Marc-André Lemburga866df82001-01-03 21:29:14 +0000754### Helpers for charmap-based codecs
755
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is both key and value, i.e. maps to itself.

    """
    return dict([(item, item) for item in rng])
768
def make_encoding_map(decoding_map):

    """ Build the inverse (encoding) map of a decoding map.

        Every value of decoding_map becomes a key that maps back to
        the key it came from.  If a target occurs more than once in
        the decoding map, it is mapped to None (undefined mapping)
        instead, causing an exception when encountered by the
        charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000789
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000790### error handlers
791
# Bind the predefined error handling callbacks to module level names.
# The lookups are done as one unit: if any of them fails, all five
# names end up as None.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000805
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below is never executed: _false is 0, so the
# branch is dead at runtime and only exists as an import statement in
# the module body.
_false = 0
if _false:
    import encodings
811
Guido van Rossum0612d842000-03-10 23:20:43 +0000812### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000813
if __name__ == '__main__':

    # Only runs when the module is executed as a script.  Both standard
    # streams are replaced by EncodedFile wrappers; the arguments are
    # (file, data_encoding, file_encoding).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')