blob: 0ffa38268f79df6d667e61ee1f2202f984bf66f8 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
# Pull in the registry API and the builtin stateless codec functions from
# the _codecs module.  NOTE(review): names used below but not defined in
# this file (e.g. lookup, lookup_error) presumably arrive via this star
# import -- confirm against _codecs.
try:
    from _codecs import *
except ImportError, why:
    # Report the failure as a SystemError that carries the text of the
    # original ImportError.
    raise SystemError,\
          'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".
# backslashreplace_errors is defined in this module together with the
# other four predefined error handlers, so it is exported alongside them.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
Walter Dörwald474458d2002-06-04 15:16:29 +000036# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 marks for both byte orders; BOM_LE / BOM_BE are short aliases
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 marks for both byte orders
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Aliases for the byte order of the machine we are running on
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names, kept for existing callers (don't use in new code):
# the "32" names hold the UTF-16 marks, the "64" names the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        More values can be made available through register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return the tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to work efficiently should use the
            StreamWriter / StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return the tuple
            (output object, length consumed).

            input must provide the bf_getreadbuf buffer slot; Python
            strings, buffer objects and memory mapped files are
            examples of such objects.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to work efficiently should use the
            StreamWriter / StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()
139
140#
141# The StreamWriter and StreamReader class provide generic working
Andrew M. Kuchling97c56352001-09-18 20:29:48 +0000142# interfaces which can be used to implement new encoding submodules
Guido van Rossum0612d842000-03-10 23:20:43 +0000143# very easily. See encodings/utf_8.py for an example on how this is
144# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000145#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    # Encodes objects with self.encode() (to be supplied by a subclass)
    # and writes the result to a wrapped stream.  Attributes that are not
    # defined here are looked up on that stream.

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter that writes to stream.

            stream has to be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding.  Predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            More values can be made available through register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with the stored error handling and write the
            encoded data to self.stream.
        """
        # encode() also reports how much input it consumed; that count is
        # not used here.
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            that allows fresh data to be appended without rescanning
            the whole stream to recover state.

            This base implementation keeps no state and does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer itself on
            the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    """ Reads data from a wrapped stream and decodes it with
        self.decode(), which has to be supplied by a subclass.

        Input that could not be decoded yet is kept in self.bytebuffer;
        decoded data that was not handed out yet is kept in
        self.charbuffer.  Attributes that are not defined here are
        looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""

    def decode(self, input, errors='strict'):
        """ Decode input and return (output object, length consumed).

            Has to be overridden; this base version always raises
            NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            When both size and chars are negative, the stream is always
            asked for its remaining data, even if decoded characters
            are still buffered from an earlier call; the buffered
            characters are returned in front of the new ones.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # A request without any limit means "everything that is left".
        # Characters left over from an earlier readline() must not
        # satisfy such a request on their own, otherwise the rest of the
        # stream would silently be skipped (readlines() relies on this).
        read_all = size < 0 and chars < 0
        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if self.charbuffer and not read_all:
                    break
            elif len(self.charbuffer) >= chars:
                break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            newchars, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
            # The stream has been asked for its remaining data once; from
            # here on a non-empty character buffer ends the loop.
            read_all = False
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            If keepends is false the line ending is stripped from the
            returned line.

        """
        # Without an explicit size start with small reads and double the
        # read size (see the end of the loop) while no line end shows up.
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries (unless
            keepends is false).

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Same initial value as in __init__: a str->str decoder must keep
        # getting str results after a reset()/seek().
        self.charbuffer = ""

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000391
392###
393
394class StreamReaderWriter:
395
Fred Drake49fd1072000-04-13 14:11:21 +0000396 """ StreamReaderWriter instances allow wrapping streams which
397 work in both read and write modes.
398
399 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000400 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000401 instance.
402
403 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000404 # Optional attributes set by the file wrappers below
405 encoding = 'unknown'
406
Tim Peters30324a72001-05-15 17:19:16 +0000407 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000408
409 """ Creates a StreamReaderWriter instance.
410
411 stream must be a Stream-like object.
412
413 Reader, Writer must be factory functions or classes
414 providing the StreamReader, StreamWriter interface resp.
415
416 Error handling is done in the same way as defined for the
417 StreamWriter/Readers.
418
419 """
420 self.stream = stream
421 self.reader = Reader(stream, errors)
422 self.writer = Writer(stream, errors)
423 self.errors = errors
424
Tim Peters30324a72001-05-15 17:19:16 +0000425 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000426
427 return self.reader.read(size)
428
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000429 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000430
431 return self.reader.readline(size)
432
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000433 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000434
435 return self.reader.readlines(sizehint)
436
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000437 def next(self):
438
439 """ Return the next decoded line from the input stream."""
440 return self.reader.next()
441
442 def __iter__(self):
443 return self
444
Tim Peters30324a72001-05-15 17:19:16 +0000445 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000446
447 return self.writer.write(data)
448
Tim Peters30324a72001-05-15 17:19:16 +0000449 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000450
451 return self.writer.writelines(list)
452
Guido van Rossum0612d842000-03-10 23:20:43 +0000453 def reset(self):
454
455 self.reader.reset()
456 self.writer.reset()
457
Tim Peters30324a72001-05-15 17:19:16 +0000458 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000459 getattr=getattr):
460
461 """ Inherit all other methods from the underlying stream.
462 """
Tim Peters30324a72001-05-15 17:19:16 +0000463 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000464
465###
466
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        """ Read via self.reader.read(size) and return the data after
            passing it through self.encode.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):
        """ Read one line via self.reader.readline() and return it
            after passing it through self.encode.

            size is only handed to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):
        """ Read everything via self.reader.read(), pass it through
            self.encode and return the result split into lines (line
            ends are kept).  sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(True)

    def next(self):

        """ Return the next line from the input stream, passed through
            self.encode like the data returned by .read() and
            .readline().
        """
        data = self.reader.next()
        # Iteration has to hand out the same frontend encoding as the
        # other read methods, so the reader's data is encoded here too.
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):
        """ Pass data through self.decode and write the result with
            self.writer.write(); returns the writer's result.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):
        """ Join the strings in list, pass the result through
            self.decode and write it with self.writer.write().
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):
        """ Reset both the reader and the writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000572
573### Shortcuts
574
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or building the wrapper fails, the
        file that was just opened is closed again before the exception
        is passed on to the caller.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        # Only the stream reader/writer factories are needed here.
        (_encode, _decode, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Clean-up only: the exception is re-raised unchanged, we just
        # must not leave the freshly opened file behind.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000618
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that translates between two
        encodings.

        Strings written to the wrapper are interpreted according to
        data_encoding and stored in the original file using
        file_encoding.  Strings read from the file are interpreted
        according to file_encoding and handed to the caller using
        data_encoding.  The intermediate format is usually Unicode
        but depends on the codecs involved.

        file_encoding defaults to data_encoding when it is not given.

        errors selects the error handling; the default 'strict'
        causes ValueErrors to be raised when an encoding error
        occurs.

        For introspection the returned object carries the attributes
        .data_encoding and .file_encoding, set to the values used.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The frontend uses the stateless functions of the data codec ...
    frontend = lookup(data_encoding)
    encode, decode = frontend[:2]
    # ... the backend the stream classes of the file codec.
    backend = lookup(file_encoding)
    Reader, Writer = backend[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000655
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000656### Helpers for codec lookup
657
def getencoder(encoding):

    """ Return the encoder function of the codec registered for
        the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    # Entry 0 of the lookup result is the encoder function.
    return codec_entry[0]
667
def getdecoder(encoding):

    """ Return the decoder function of the codec registered for
        the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    # Entry 1 of the lookup result is the decoder function.
    return codec_entry[1]
677
def getreader(encoding):

    """ Return the StreamReader class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    # Entry 2 of the lookup result is the stream reader factory.
    return codec_entry[2]
687
def getwriter(encoding):

    """ Return the StreamWriter class or factory function of the
        codec registered for the given encoding.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    # Entry 3 of the lookup result is the stream writer factory.
    return codec_entry[3]
697
Marc-André Lemburga866df82001-01-03 21:29:14 +0000698### Helpers for charmap-based codecs
699
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is a key that maps to itself.

    """
    identity_pairs = [(item, item) for item in rng]
    return dict(identity_pairs)
712
def make_encoding_map(decoding_map):

    """ Invert a decoding map into an encoding map.

        Every value of decoding_map becomes a key that maps back to
        the key it came from.  A value that occurs more than once in
        decoding_map has no unique source and is mapped to None
        (undefined mapping), which causes an exception when the
        charmap codec encounters it during translation.

        cp875.py is one example: it decodes several characters to
        U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000733
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000734### error handlers
735
try:
    # All five predefined handlers are looked up in one go; the
    # unpacking sits inside the try block so that a failing lookup is
    # caught no matter when map() performs it.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error,
                                    ("strict",
                                     "ignore",
                                     "replace",
                                     "xmlcharrefreplace",
                                     "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000749
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000750# Tell modulefinder that using codecs probably needs the encodings
751# package
# _false is 0, so the branch below is never executed; the import
# statement is only there to be seen by tools that scan the source for
# imports.
_false = 0
if _false:
    import encodings
755
Guido van Rossum0612d842000-03-10 23:20:43 +0000756### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000757
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output:
    # data written is taken as Latin-1 (data_encoding) and stored as
    # UTF-8 (file_encoding).
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # the stream is read as Latin-1 (file_encoding) and handed to the
    # caller as UTF-8 (data_encoding).
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')