# blob: d972a5191fd74ccb1d72c406540634b8a43ab211
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
# Pull the C-level registry and the builtin stateless codec functions
# into this module's namespace.
try:
    from _codecs import *
except ImportError:
    # Fetch the active exception through sys so that the clause is spelled
    # the same way on every interpreter version.
    why = sys.exc_info()[1]
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  All five error handler
# objects bound near the end of this module are listed; previously
# backslashreplace_errors was defined but not exported.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness: anything that is not reported as 'little' is
# treated as big endian.
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native-endian UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode/decode efficiently
            should use StreamCodec instead.

            Zero length input must be supported; in that case an
            empty object of the output object type is returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply and defaults
            to 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to encode/decode efficiently
            should use StreamCodec instead.

            Zero length input must be supported; in that case an
            empty object of the output object type is returned.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError()
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that is passed
            to .encode() on every write. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state.

            The base implementation does nothing.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer itself
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Bytes read from the stream that .decode() has not consumed yet
        self.bytebuffer = ""
        # Decoded characters that have not been handed to the caller yet.
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""

    def decode(self, input, errors='strict'):
        """ Decode input and return (output object, length consumed).

            Must be provided by subclasses; this implementation
            always raises NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If neither size nor chars is given, the stream is read
            until it returns no more data, and everything decoded
            (including characters buffered by earlier calls) is
            returned.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                # With a byte limit, buffered characters are returned
                # before touching the stream again.  Without any limit the
                # caller asked for everything, so the stream still has to
                # be drained even if characters are already buffered.
                if size >= 0 and self.charbuffer:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError:
                # sys.exc_info() is used to get at the exception object so
                # that the clause is valid on every interpreter version.
                exc = sys.exc_info()[1]
                if firstline:
                    # Retry with only the bytes before the error position
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    # Without a complete first line in front of the error
                    # there is nothing useful to return
                    if len(lines) <= 1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; in that case read() is only called once.
            Otherwise reading starts with 72 bytes and the amount is
            doubled on every further attempt while it is below 8000.

            If keepends is false the line terminator is stripped
            from the returned line.

        """
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The data is decoded with a single read() call and split
            with splitlines(); line terminators are included in the
            list entries if keepends is true.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        # Same initial value as in __init__: a plain str keeps str->str
        # decoding working after a reset and is promoted to unicode by
        # the first read of a str->unicode codec.
        self.charbuffer = ""

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000405
406###
407
class StreamReaderWriter:

    """ Wrapper for streams that are used for both reading and
        writing.

        The constructor takes the reader and writer factories in the
        form in which the codec.lookup() function returns them, so
        the lookup result can be used directly to build an instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Build a StreamReaderWriter around stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively; each is called with (stream, errors).

            errors has the same meaning as for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # The reader is created before the writer
        reader = Reader(stream, errors)
        writer = Writer(stream, errors)
        self.reader = reader
        self.writer = writer
        self.errors = errors

    # --- reading: everything is forwarded to self.reader

    def read(self, size=-1):

        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    # --- writing: everything is forwarded to self.writer

    def write(self, data):

        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        # Reader first, then writer
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the wrapper itself
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000478
479###
480
class StreamRecoder:

    """ StreamRecoder instances translate between a frontend and a
        backend encoding.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data passed to .write() is first run through the frontend
        decode function, yielding an intermediate format (which
        depends on the given codec combination), and is then written
        to the stream by an instance of the provided Writer class.

        In the other direction, data is fetched from the stream by a
        Reader instance and run through the frontend encode function
        before it is returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Build a StreamRecoder which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and the data passed to .write())
            while Reader and Writer work on the backend (reading from
            and writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.; each factory
            is called with (stream, errors).

            Unicode is used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        # Frontend translation functions
        self.encode = encode
        self.decode = decode
        # Backend stream codecs; the reader is created first
        reader = Reader(stream, errors)
        writer = Writer(stream, errors)
        self.reader = reader
        self.writer = writer
        self.errors = errors

    def read(self, size=-1):

        raw = self.reader.read(size)
        encoded, _length = self.encode(raw, self.errors)
        return encoded

    def readline(self, size=None):

        # Only pass size on to the reader if the caller supplied one
        if size is not None:
            raw = self.reader.readline(size)
        else:
            raw = self.reader.readline()
        encoded, _length = self.encode(raw, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        # sizehint is not used: everything is read, encoded and then split
        raw = self.reader.read()
        encoded, _length = self.encode(raw, self.errors)
        return encoded.splitlines(True)

    def next(self):

        """ Return the next line from the input stream, run through
            the frontend encode function."""
        raw = self.reader.next()
        encoded, _length = self.encode(raw, self.errors)
        return encoded

    def __iter__(self):
        return self

    def write(self, data):

        decoded, _length = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        joined = ''.join(list)
        decoded, _length = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        # Reader first, then writer
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the recoder itself
            to the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000588
589### Shortcuts
590
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file opened for it is closed again before the exception
        is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        Reader, Writer = lookup(encoding)[2:]
        srw = StreamReaderWriter(file, Reader, Writer, errors)
    except:
        # Don't leak the already opened file (e.g. on an unknown
        # encoding); the original exception is re-raised unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000634
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder that transparently translates
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless functions of the data encoding
    frontend_encode, frontend_decode = lookup(data_encoding)[:2]
    # Backend: stream classes of the file encoding
    BackendReader, BackendWriter = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file,
                            frontend_encode, frontend_decode,
                            BackendReader, BackendWriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000671
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000672### Helpers for codec lookup
673
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (the first entry of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    entry = lookup(encoding)
    return entry[0]
683
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (the second entry of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    entry = lookup(encoding)
    return entry[1]
693
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (the third entry
        of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    entry = lookup(encoding)
    return entry[2]
703
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (the fourth entry
        of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    entry = lookup(encoding)
    return entry[3]
713
Marc-André Lemburga866df82001-01-03 21:29:14 +0000714### Helpers for charmap-based codecs
715
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    return dict([(item, item) for item in rng])
728
def make_encoding_map(decoding_map):

    """ Invert a decoding map to obtain an encoding map.

        A target that appears more than once in the decoding map
        cannot be inverted unambiguously; it is mapped to None
        (undefined mapping), which causes an exception when the
        charmap codec encounters it during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: mark the target as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000749
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000750### error handlers
751
# Bind the predefined error handlers to module level names.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # all five names are bound to None in that case.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000765
# Hint for modulefinder: programs using codecs probably need the
# encodings package as well.  The import below is never executed.
_false = 0
if _false:
    import encodings
771
Guido van Rossum0612d842000-03-10 23:20:43 +0000772### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000773
if __name__ == '__main__':

    # Wrap stdout: Latin-1 data written by the program ends up as
    # UTF-8 on the real stdout
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Wrap stdin: Latin-1 data on the real stdin is handed to the
    # program as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')