blob: a964f991886094e27646af6b0a5c61ab4f23c590 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
try:
    from _codecs import *
except ImportError:
    # Pick the exception up from sys.exc_info() so that the reason for
    # the failed import still ends up in the error message.
    why = sys.exc_info()[1]
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  All five predefined error
# handler callbacks are listed, including backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
Walter Dörwald474458d2002-06-04 15:16:29 +000036# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 marks for both byte orders
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32 marks for both byte orders
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Marks matching the byte order of the machine we are running on
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the 32/64 in the
# names they alias the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available with register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations must not keep state on the Codec instance;
            codecs that need state to work efficiently should be
            written as stream codecs instead.

            Zero length input has to be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations must not keep state on the Codec instance;
            codecs that need state to work efficiently should be
            written as stream codecs instead.

            Zero length input has to be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream has to be a file-like object which was opened for
            writing (binary) data.

            errors selects the error handling scheme used when
            encoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            More values can be made available with register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows appending fresh data without rescanning the
            whole stream to recover state.

            The base implementation does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
209class StreamReader(Codec):
210
Tim Peters30324a72001-05-15 17:19:16 +0000211 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000212
213 """ Creates a StreamReader instance.
214
215 stream must be a file-like object open for reading
216 (binary) data.
217
Walter Dörwald7f82f792002-11-19 21:42:53 +0000218 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000219 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000220 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000221
222 'strict' - raise a ValueError (or a subclass)
223 'ignore' - ignore the character and continue with the next
224 'replace'- replace with a suitable replacement character;
225
Walter Dörwald7f82f792002-11-19 21:42:53 +0000226 The set of allowed parameter values can be extended via
227 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000228 """
229 self.stream = stream
230 self.errors = errors
Walter Dörwald69652032004-09-07 20:24:22 +0000231 self.bytebuffer = ""
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000232 # For str->str decoding this will stay a str
233 # For str->unicode decoding the first read will promote it to unicode
234 self.charbuffer = ""
Guido van Rossum0612d842000-03-10 23:20:43 +0000235
Walter Dörwald69652032004-09-07 20:24:22 +0000236 def decode(self, input, errors='strict'):
237 raise NotImplementedError
238
Martin v. Löwis56066d22005-08-24 07:38:12 +0000239 def read(self, size=-1, chars=-1, firstline=False):
Guido van Rossum0612d842000-03-10 23:20:43 +0000240
241 """ Decodes data from the stream self.stream and returns the
242 resulting object.
243
Walter Dörwald69652032004-09-07 20:24:22 +0000244 chars indicates the number of characters to read from the
245 stream. read() will never return more than chars
246 characters, but it might return less, if there are not enough
247 characters available.
248
Guido van Rossum0612d842000-03-10 23:20:43 +0000249 size indicates the approximate maximum number of bytes to
250 read from the stream for decoding purposes. The decoder
251 can modify this setting as appropriate. The default value
252 -1 indicates to read and decode as much as possible. size
253 is intended to prevent having to decode huge files in one
254 step.
255
Martin v. Löwis56066d22005-08-24 07:38:12 +0000256 If firstline is true, and a UnicodeDecodeError happens
257 after the first line terminator in the input only the first line
258 will be returned, the rest of the input will be kept until the
259 next call to read().
260
Guido van Rossum0612d842000-03-10 23:20:43 +0000261 The method should use a greedy read strategy meaning that
262 it should read as much data as is allowed within the
263 definition of the encoding and the given size, e.g. if
264 optional encoding endings or state markers are available
265 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000266 """
Walter Dörwald69652032004-09-07 20:24:22 +0000267 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000268 while True:
269 # can the request can be satisfied from the character buffer?
270 if chars < 0:
271 if self.charbuffer:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000272 break
Guido van Rossum0612d842000-03-10 23:20:43 +0000273 else:
Walter Dörwald69652032004-09-07 20:24:22 +0000274 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000275 break
276 # we need more data
277 if size < 0:
278 newdata = self.stream.read()
279 else:
280 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000281 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000282 data = self.bytebuffer + newdata
Martin v. Löwis56066d22005-08-24 07:38:12 +0000283 try:
284 newchars, decodedbytes = self.decode(data, self.errors)
285 except UnicodeDecodeError, exc:
286 if firstline:
287 newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
288 lines = newchars.splitlines(True)
289 if len(lines)<=1:
290 raise
291 else:
292 raise
Walter Dörwald69652032004-09-07 20:24:22 +0000293 # keep undecoded bytes until the next call
294 self.bytebuffer = data[decodedbytes:]
295 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000296 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000297 # there was no data available
298 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000299 break
300 if chars < 0:
301 # Return everything we've got
302 result = self.charbuffer
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000303 self.charbuffer = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000304 else:
305 # Return the first chars characters
306 result = self.charbuffer[:chars]
307 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000308 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000309
Walter Dörwald69652032004-09-07 20:24:22 +0000310 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000311
312 """ Read one line from the input stream and return the
313 decoded data.
314
Walter Dörwald69652032004-09-07 20:24:22 +0000315 size, if given, is passed as size argument to the
316 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000317
Guido van Rossuma3277132000-04-11 15:37:43 +0000318 """
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000319 readsize = size or 72
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000320 line = ""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000321 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000322 while True:
Martin v. Löwis56066d22005-08-24 07:38:12 +0000323 data = self.read(readsize, firstline=True)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000324 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000325 # If we're at a "\r" read one extra character (which might
326 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000327 # temporarily exhausted we return the wrong line ending.
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000328 if data.endswith("\r"):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000329 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000330
Walter Dörwald69652032004-09-07 20:24:22 +0000331 line += data
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000332 lines = line.splitlines(True)
333 if lines:
334 line0withend = lines[0]
335 line0withoutend = lines[0].splitlines(False)[0]
336 if line0withend != line0withoutend: # We really have a line end
337 # Put the rest back together and keep it until the next call
Walter Dörwaldc9878e12005-07-20 22:15:39 +0000338 self.charbuffer = "".join(lines[1:]) + self.charbuffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000339 if keepends:
340 line = line0withend
341 else:
342 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000343 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000344 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000345 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000346 if line and not keepends:
347 line = line.splitlines(False)[0]
348 break
349 if readsize<8000:
350 readsize *= 2
351 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000352
Walter Dörwald69652032004-09-07 20:24:22 +0000353 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000354
355 """ Read all lines available on the input stream
356 and return them as list of lines.
357
358 Line breaks are implemented using the codec's decoder
359 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000360
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000361 sizehint, if given, is ignored since there is no efficient
362 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000363
364 """
Walter Dörwald69652032004-09-07 20:24:22 +0000365 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000366 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000367
368 def reset(self):
369
370 """ Resets the codec buffers used for keeping state.
371
372 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000373 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000374 from decoding errors.
375
376 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000377 self.bytebuffer = ""
378 self.charbuffer = u""
Walter Dörwald729c31f2005-03-14 19:06:30 +0000379
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000380 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000381 """ Set the input stream's current position.
382
383 Resets the codec buffers used for keeping state.
384 """
385 self.reset()
386 self.stream.seek(offset, whence)
Guido van Rossum0612d842000-03-10 23:20:43 +0000387
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000388 def next(self):
389
390 """ Return the next decoded line from the input stream."""
391 line = self.readline()
392 if line:
393 return line
394 raise StopIteration
395
396 def __iter__(self):
397 return self
398
Tim Peters30324a72001-05-15 17:19:16 +0000399 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000400 getattr=getattr):
401
402 """ Inherit all other methods from the underlying stream.
403 """
Tim Peters30324a72001-05-15 17:19:16 +0000404 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000405
406###
407
408class StreamReaderWriter:
409
Fred Drake49fd1072000-04-13 14:11:21 +0000410 """ StreamReaderWriter instances allow wrapping streams which
411 work in both read and write modes.
412
413 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000414 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000415 instance.
416
417 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000418 # Optional attributes set by the file wrappers below
419 encoding = 'unknown'
420
Tim Peters30324a72001-05-15 17:19:16 +0000421 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000422
423 """ Creates a StreamReaderWriter instance.
424
425 stream must be a Stream-like object.
426
427 Reader, Writer must be factory functions or classes
428 providing the StreamReader, StreamWriter interface resp.
429
430 Error handling is done in the same way as defined for the
431 StreamWriter/Readers.
432
433 """
434 self.stream = stream
435 self.reader = Reader(stream, errors)
436 self.writer = Writer(stream, errors)
437 self.errors = errors
438
Tim Peters30324a72001-05-15 17:19:16 +0000439 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000440
441 return self.reader.read(size)
442
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000443 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000444
445 return self.reader.readline(size)
446
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000447 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000448
449 return self.reader.readlines(sizehint)
450
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000451 def next(self):
452
453 """ Return the next decoded line from the input stream."""
454 return self.reader.next()
455
456 def __iter__(self):
457 return self
458
Tim Peters30324a72001-05-15 17:19:16 +0000459 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000460
461 return self.writer.write(data)
462
Tim Peters30324a72001-05-15 17:19:16 +0000463 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000464
465 return self.writer.writelines(list)
466
Guido van Rossum0612d842000-03-10 23:20:43 +0000467 def reset(self):
468
469 self.reader.reset()
470 self.writer.reset()
471
Tim Peters30324a72001-05-15 17:19:16 +0000472 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000473 getattr=getattr):
474
475 """ Inherit all other methods from the underlying stream.
476 """
Tim Peters30324a72001-05-15 17:19:16 +0000477 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000478
479###
480
481class StreamRecoder:
482
Fred Drake49fd1072000-04-13 14:11:21 +0000483 """ StreamRecoder instances provide a frontend - backend
484 view of encoding data.
485
486 They use the complete set of APIs returned by the
487 codecs.lookup() function to implement their task.
488
489 Data written to the stream is first decoded into an
490 intermediate format (which is dependent on the given codec
491 combination) and then written to the stream using an instance
492 of the provided Writer class.
493
494 In the other direction, data is read from the stream using a
495 Reader instance and then return encoded data to the caller.
496
497 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000498 # Optional attributes set by the file wrappers below
499 data_encoding = 'unknown'
500 file_encoding = 'unknown'
501
Tim Peters30324a72001-05-15 17:19:16 +0000502 def __init__(self, stream, encode, decode, Reader, Writer,
503 errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000504
505 """ Creates a StreamRecoder instance which implements a two-way
506 conversion: encode and decode work on the frontend (the
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000507 input to .read() and output of .write()) while
Guido van Rossum0612d842000-03-10 23:20:43 +0000508 Reader and Writer work on the backend (reading and
Fred Drake908670c2000-03-17 15:42:11 +0000509 writing to the stream).
Guido van Rossum0612d842000-03-10 23:20:43 +0000510
511 You can use these objects to do transparent direct
512 recodings from e.g. latin-1 to utf-8 and back.
513
514 stream must be a file-like object.
515
516 encode, decode must adhere to the Codec interface, Reader,
517 Writer must be factory functions or classes providing the
518 StreamReader, StreamWriter interface resp.
519
520 encode and decode are needed for the frontend translation,
521 Reader and Writer for the backend translation. Unicode is
522 used as intermediate encoding.
523
524 Error handling is done in the same way as defined for the
525 StreamWriter/Readers.
526
527 """
528 self.stream = stream
529 self.encode = encode
530 self.decode = decode
531 self.reader = Reader(stream, errors)
532 self.writer = Writer(stream, errors)
533 self.errors = errors
534
Tim Peters30324a72001-05-15 17:19:16 +0000535 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000536
537 data = self.reader.read(size)
538 data, bytesencoded = self.encode(data, self.errors)
539 return data
540
Tim Peters30324a72001-05-15 17:19:16 +0000541 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000542
543 if size is None:
544 data = self.reader.readline()
545 else:
546 data = self.reader.readline(size)
547 data, bytesencoded = self.encode(data, self.errors)
548 return data
549
Tim Peters30324a72001-05-15 17:19:16 +0000550 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000551
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000552 data = self.reader.read()
Guido van Rossuma3277132000-04-11 15:37:43 +0000553 data, bytesencoded = self.encode(data, self.errors)
554 return data.splitlines(1)
555
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000556 def next(self):
557
558 """ Return the next decoded line from the input stream."""
559 return self.reader.next()
560
561 def __iter__(self):
562 return self
563
Tim Peters30324a72001-05-15 17:19:16 +0000564 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000565
566 data, bytesdecoded = self.decode(data, self.errors)
567 return self.writer.write(data)
568
Tim Peters30324a72001-05-15 17:19:16 +0000569 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000570
571 data = ''.join(list)
572 data, bytesdecoded = self.decode(data, self.errors)
573 return self.writer.write(data)
Guido van Rossum0612d842000-03-10 23:20:43 +0000574
575 def reset(self):
576
577 self.reader.reset()
578 self.writer.reset()
579
Tim Peters30324a72001-05-15 17:19:16 +0000580 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000581 getattr=getattr):
582
583 """ Inherit all other methods from the underlying stream.
584 """
Tim Peters30324a72001-05-15 17:19:16 +0000585 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000586
587### Shortcuts
588
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  Without an encoding the plain file object returned by
        the builtin open() is handed back.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises a LookupError in case the encoding cannot be found; the
        file is not opened in that case.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)
    # Resolve the codec before touching the file: a failing lookup
    # must neither leave an open file object behind nor create or
    # truncate the file when a write mode was requested.
    reader, writer = lookup(encoding)[2:4]
    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    srw = StreamReaderWriter(file, reader, writer, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000632
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings on the fly.

        Strings written to the wrapper are taken to be encoded in
        data_encoding and end up in the original file encoded in
        file_encoding.  Strings read from the file are taken to be
        encoded in file_encoding and are handed to the caller encoded
        in data_encoding.  The intermediate format will usually be
        Unicode, but this depends on the codecs involved.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The wrapper carries the two attributes .data_encoding and
        .file_encoding which hold the values of the parameters of the
        same name, so that Python programs can introspect them.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend codec: stateless functions; backend codec: stream classes
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend[0], frontend[1],
                            backend[2], backend[3],
                            errors)
    # Record the encodings on the wrapper for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000669
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000670### Helpers for codec lookup
671
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    # entry 0 of the codec tuple is the encoder
    return codec[0]
681
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    # entry 1 of the codec tuple is the decoder
    return codec[1]
691
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    # entry 2 of the codec tuple is the StreamReader factory
    return codec[2]
701
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    # entry 3 of the codec tuple is the StreamWriter factory
    return codec[3]
711
Marc-André Lemburga866df82001-01-03 21:29:14 +0000712### Helpers for charmap-based codecs
713
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    return dict([(item, item) for item in rng])
726
def make_encoding_map(decoding_map):

    """ Invert a decoding map into an encoding map.

        A target which is reached from more than one key of the
        decoding map cannot be inverted unambiguously; such a target
        is mapped to None (undefined mapping), which makes the charmap
        codec raise an exception when it comes across it during
        translation.

        cp875.py is one example: it decodes several characters to
        U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # second or later occurrence: mark as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000747
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000748### error handlers
749
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # one failed lookup turns all five names into None.
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000763
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000764# Tell modulefinder that using codecs probably needs the encodings
765# package
766_false = 0
767if _false:
768 import encodings
769
Guido van Rossum0612d842000-03-10 23:20:43 +0000770### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000771
if __name__ == '__main__':

    # Strings printed to stdout are taken as Latin-1 and come out as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Latin-1 data arriving on stdin is handed to the program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')