blob: 58bba73ac0acc428c265aa287e0dbfefeea99c17 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  Every predefined error
# handler bound further down in this module is listed here, including
# backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8 encoded byte order mark
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 byte order marks (BOM_LE / BOM_BE are the short spellings)
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 byte order marks
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness variants, selected by the byte order of this machine
if sys.byteorder == 'little':
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_LE, BOM_UTF32_LE
else:
    BOM_UTF16, BOM_UTF32 = BOM_UTF16_BE, BOM_UTF32_BE
BOM = BOM_UTF16

# Old broken names (don't use in new code): despite the "32"/"64" in
# the names they refer to the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme.  These string values are
        predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - replace with backslashed escape sequences
                               (only for encoding).

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to work efficiently should build on
            the stream classes (StreamWriter / StreamReader) instead.

            Zero length input must be accepted and has to yield an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not keep state on the Codec instance;
            codecs that need state to work efficiently should build on
            the stream classes (StreamWriter / StreamReader) instead.

            Zero length input must be accepted and has to yield an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding.  These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - replace with backslashed escape
                                   sequences (only for encoding).

            Additional values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        # encode() returns (output, length consumed); only the output
        # is needed here.
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After calling this method the output should be in a clean
            state which allows appending fresh data without having to
            rescan the whole stream to recover state.

            The base implementation has nothing to reset.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer itself
            to the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream that could not be decoded yet
        self.bytebuffer = ""
        # characters decoded but not yet handed out to the caller
        self.charbuffer = u""
        # True if the data returned last by readline() ended in "\r"
        self.atcr = False

    def decode(self, input, errors='strict'):
        """ Decode input; must be provided by the concrete codec. """
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If both size and chars are negative, the stream is read
            until it reports no more data; characters left in the
            internal buffer by an earlier call (e.g. by readline())
            are returned in front of the newly decoded ones.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                # size-limited read: hand out whatever is buffered
                if self.charbuffer:
                    break
            # Otherwise (size < 0 and chars < 0) the caller asked for
            # everything, so buffered characters alone never satisfy
            # the request; keep reading until the stream is exhausted.

            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            newchars, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = u""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

            keepends controls whether the line end is kept in the
            returned line.

        """
        readsize = size or 72
        line = u""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize)
            # the previous chunk ended in "\r": a leading "\n" belongs
            # to that line end and must not start a new line
            if self.atcr and data.startswith(u"\n"):
                data = data[1:]
            if data:
                self.atcr = data.endswith(u"\r")
                # If we're at a "\r" (and are allowed to read more), read one
                # extra character (which might be a "\n") to get a proper
                # line ending. (If the stream is temporarily exhausted we return
                # the wrong line ending, but at least we won't generate a bogus
                # second line.)
                if self.atcr and size is None:
                    data += self.read(size=1, chars=1)
                    self.atcr = data.endswith(u"\r")

            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the chunk size for long lines, up to a limit
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries unless
            keepends is false.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.atcr = False

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration once readline() returns an empty line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000397
398###
399
class StreamReaderWriter:

    """ Wrapper for streams which work in both read and write modes.

        The constructor takes the reader and writer factories in the
        form returned by the codec.lookup() function, so those can be
        passed in directly.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Both are called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    # --- reading: forwarded to the reader ---

    def read(self, size=-1):
        """ Return the result of the reader's read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):
        """ Return the result of the reader's readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):
        """ Return the result of the reader's readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        reader = self.reader
        return reader.next()

    def __iter__(self):
        return self

    # --- writing: forwarded to the writer ---

    def write(self, data):
        """ Return the result of the writer's write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):
        """ Return the result of the writer's writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):
        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the wrapper itself
            to the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000470
471###
472
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned encoded to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            encoded with the frontend encoder.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it with the
            frontend encoder and return the list of lines (line ends
            included).  sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder like the other read methods.
        """
        # The reader yields data in the intermediate format; callers of
        # the recoder expect the frontend encoding, exactly as for
        # read() and readline().
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decode data with the frontend decoder and write the result
            via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the strings in list, decode the result with the
            frontend decoder and write it via the backend writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000578
579### Shortcuts
580
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API
        and is passed through unchanged. It defaults to 1.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        file is closed again before the exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Deliberately catches everything: the exception is re-raised
        # unchanged, we only make sure the file handle is not leaked.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000624
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which transparently translates
        between two encodings.

        Strings written to the wrapper are interpreted according to
        data_encoding and then written to the original file as string
        using file_encoding. The intermediate encoding will usually
        be Unicode but depends on the specified codecs.

        Strings read from the file are interpreted using
        file_encoding and handed back to the caller as string using
        data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the values used
        for the parameters of the same name, for introspection by
        Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend: the stateless encoder/decoder pair of the data encoding
    frontend = lookup(data_encoding)
    to_data, from_data = frontend[:2]
    # backend: the stream reader/writer factories of the file encoding
    backend = lookup(file_encoding)
    make_reader, make_writer = backend[2:]
    recoder = StreamRecoder(file,
                            to_data, from_data, make_reader, make_writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000661
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000662### Helpers for codec lookup
663
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (the first entry of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
673
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (the second entry of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
683
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (the third entry
        of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
693
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (the fourth entry
        of the lookup result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
703
Marc-André Lemburga866df82001-01-03 21:29:14 +0000704### Helpers for charmap-based codecs
705
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence appears as a key mapped to itself.

    """
    return dict([(item, item) for item in rng])
718
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target which occurs more than once in the decoding map
        cannot be inverted unambiguously; it is mapped to None
        (undefined mapping), causing an exception when encountered
        by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # seen before: ambiguous, mark as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000739
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000740### error handlers
741
# Bind the predefined error handling callbacks to module level names.
# All lookups are done before anything is bound; if any of them fails
# every name ends up as None.
try:
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = (lookup_error("strict"),
                                 lookup_error("ignore"),
                                 lookup_error("replace"),
                                 lookup_error("xmlcharrefreplace"),
                                 lookup_error("backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = \
        xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000755
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000756# Tell modulefinder that using codecs probably needs the encodings
757# package
758_false = 0
759if _false:
760 import encodings
761
Guido van Rossum0612d842000-03-10 23:20:43 +0000762### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000763
if __name__ == '__main__':

    # When run as a script, wrap the standard streams in recoders.

    # stdout: strings written are taken as Latin-1 and stored as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # stdin: data is read as Latin-1 and handed back as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')