# Source blob: 932f01bec727cc90fa61aab051a247e7251492e8
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import __builtin__
import sys

### Registry and builtin stateless codec functions

# Names used below but not defined in this file (lookup, lookup_error, ...)
# are expected to come from this star import of the _codecs module.
try:
    from _codecs import *
except ImportError:
    # Fetch the exception through sys.exc_info() and raise with call syntax:
    # both forms are equivalent to the old "except E, v" / "raise E, msg"
    # spellings but parse on every Python version.
    why = sys.exc_info()[1]
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Public API exported by "from codecs import *".  backslashreplace_errors is
# listed alongside the other predefined error handler objects.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses have to override it.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

            This base implementation always raises NotImplementedError;
            subclasses have to override it.

        """
        raise NotImplementedError

#
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using the error
            handling scheme stored in self.errors.
        """
        # The consumed count reported by encode() is not used here
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Input read from the stream that decode() has not consumed yet
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # Lines split off by readline() and waiting to be returned, or None
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Has to be provided by the codec specific subclass
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if self.charbuffer:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError:
                if not firstline:
                    raise
                # sys.exc_info() gives the exception without relying on
                # version specific "except" syntax
                exc = sys.exc_info()[1]
                # Retry with the input in front of the offending position
                newchars, decodedbytes = self.decode(data[:exc.start],
                                                     self.errors)
                lines = newchars.splitlines(True)
                # Only swallow the error if the valid prefix extends past
                # the first line; the rest stays in bytebuffer
                if len(lines) <= 1:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false, the line terminator is removed from
            the returned line.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer.pop(0)
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines.pop(0)
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # No complete line yet: ask for more input next time round
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Same initial value as in __init__ so that str->str codecs keep
        # returning str objects after a reset
        self.charbuffer = ""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # Both wrappers operate on the same underlying stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's read(size)."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's readline(size)."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's readlines(sizehint)."""
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Pass data on to the writer's write() method."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Pass list on to the writer's writelines() method."""
        return self.writer.writelines(list)

    def reset(self):

        """ Reset both the reader and the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data re-encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encode function.

            size is only passed on to the reader if it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, re-encode it and
            return it as list of lines with the line ends kept.

            sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def next(self):

        """ Return the next line from the input stream, re-encoded
            with the frontend encode function.
        """
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result using the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join list, decode the result with the frontend decode
            function and write it using the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset both the backend reader and the backend writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object is returned and the
        mode is passed on unchanged.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If wrapping the file fails (e.g. lookup() raises a LookupError
        for an unknown encoding), the file is closed again before the
        exception is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        # Only the stream reader/writer factories are needed here
        sr, sw = lookup(encoding)[2:]
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Don't leak the already opened file; the exception is re-raised
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless encode/decode functions of the data encoding
    encode, decode = lookup(data_encoding)[:2]
    # Backend: stream reader/writer factories of the file encoding
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The encoder is the first entry of the lookup() result
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The decoder is the second entry of the lookup() result
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The stream reader is the third entry of the lookup() result
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    # The stream writer is the fourth entry of the lookup() result
    return lookup(encoding)[3]

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to U+001A.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # Target seen before: the reverse mapping is ambiguous
            m[v] = None
    return m

### error handlers

# Module level shortcuts for the predefined error handling callbacks
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    # Never executed; the import statement only has to be present
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')