blob: 08cf50b31e8d3762a4a82ce9bb8482656281cfef [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
try:
    from _codecs import *
except ImportError:
    # Pick the active exception up from sys so that its text can be
    # embedded in the SystemError reported to the caller.
    why = sys.exc_info()[1]
    raise SystemError('Failed to load the builtin codecs: %s' % why)
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  All five predefined error
# handler callbacks looked up at the end of this module are listed,
# including backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, one constant per byte order (plus the short legacy aliases)
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, one constant per byte order
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness: pick the variant matching the running machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Additional values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; implementations return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state for efficient encoding/decoding
            should build on StreamWriter/StreamReader instead.

            Zero length input must be supported and has to yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; implementations return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state for efficient encoding/decoding
            should build on StreamWriter/StreamReader instead.

            Zero length input must be supported and has to yield an
            empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After calling this method the output should be in a clean
            state which allows appending fresh data without having to
            rescan the whole stream to recover state.

            The base class keeps no state, so nothing is done here.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer to the
            underlying stream.
        """
        underlying = self.stream
        return getattr(underlying, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
209class StreamReader(Codec):
210
Tim Peters30324a72001-05-15 17:19:16 +0000211 def __init__(self, stream, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000212
213 """ Creates a StreamReader instance.
214
215 stream must be a file-like object open for reading
216 (binary) data.
217
Walter Dörwald7f82f792002-11-19 21:42:53 +0000218 The StreamReader may use different error handling
Guido van Rossum0612d842000-03-10 23:20:43 +0000219 schemes by providing the errors keyword argument. These
Walter Dörwald7f82f792002-11-19 21:42:53 +0000220 parameters are predefined:
Guido van Rossum0612d842000-03-10 23:20:43 +0000221
222 'strict' - raise a ValueError (or a subclass)
223 'ignore' - ignore the character and continue with the next
224 'replace'- replace with a suitable replacement character;
225
Walter Dörwald7f82f792002-11-19 21:42:53 +0000226 The set of allowed parameter values can be extended via
227 register_error.
Guido van Rossum0612d842000-03-10 23:20:43 +0000228 """
229 self.stream = stream
230 self.errors = errors
Walter Dörwald69652032004-09-07 20:24:22 +0000231 self.bytebuffer = ""
232 self.charbuffer = u""
Guido van Rossum0612d842000-03-10 23:20:43 +0000233
Walter Dörwald69652032004-09-07 20:24:22 +0000234 def decode(self, input, errors='strict'):
235 raise NotImplementedError
236
237 def read(self, size=-1, chars=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000238
239 """ Decodes data from the stream self.stream and returns the
240 resulting object.
241
Walter Dörwald69652032004-09-07 20:24:22 +0000242 chars indicates the number of characters to read from the
243 stream. read() will never return more than chars
244 characters, but it might return less, if there are not enough
245 characters available.
246
Guido van Rossum0612d842000-03-10 23:20:43 +0000247 size indicates the approximate maximum number of bytes to
248 read from the stream for decoding purposes. The decoder
249 can modify this setting as appropriate. The default value
250 -1 indicates to read and decode as much as possible. size
251 is intended to prevent having to decode huge files in one
252 step.
253
254 The method should use a greedy read strategy meaning that
255 it should read as much data as is allowed within the
256 definition of the encoding and the given size, e.g. if
257 optional encoding endings or state markers are available
258 on the stream, these should be read too.
Guido van Rossum0612d842000-03-10 23:20:43 +0000259 """
Walter Dörwald69652032004-09-07 20:24:22 +0000260 # read until we get the required number of characters (if available)
Walter Dörwald69652032004-09-07 20:24:22 +0000261 while True:
262 # can the request can be satisfied from the character buffer?
263 if chars < 0:
264 if self.charbuffer:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000265 break
Guido van Rossum0612d842000-03-10 23:20:43 +0000266 else:
Walter Dörwald69652032004-09-07 20:24:22 +0000267 if len(self.charbuffer) >= chars:
Walter Dörwald69652032004-09-07 20:24:22 +0000268 break
269 # we need more data
270 if size < 0:
271 newdata = self.stream.read()
272 else:
273 newdata = self.stream.read(size)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000274 # decode bytes (those remaining from the last call included)
Walter Dörwald69652032004-09-07 20:24:22 +0000275 data = self.bytebuffer + newdata
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000276 newchars, decodedbytes = self.decode(data, self.errors)
Walter Dörwald69652032004-09-07 20:24:22 +0000277 # keep undecoded bytes until the next call
278 self.bytebuffer = data[decodedbytes:]
279 # put new characters in the character buffer
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000280 self.charbuffer += newchars
Walter Dörwald69652032004-09-07 20:24:22 +0000281 # there was no data available
282 if not newdata:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000283 break
284 if chars < 0:
285 # Return everything we've got
286 result = self.charbuffer
287 self.charbuffer = u""
288 else:
289 # Return the first chars characters
290 result = self.charbuffer[:chars]
291 self.charbuffer = self.charbuffer[chars:]
Walter Dörwald69652032004-09-07 20:24:22 +0000292 return result
Guido van Rossum0612d842000-03-10 23:20:43 +0000293
Walter Dörwald69652032004-09-07 20:24:22 +0000294 def readline(self, size=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000295
296 """ Read one line from the input stream and return the
297 decoded data.
298
Walter Dörwald69652032004-09-07 20:24:22 +0000299 size, if given, is passed as size argument to the
300 read() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000301
Guido van Rossuma3277132000-04-11 15:37:43 +0000302 """
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000303 readsize = size or 72
Walter Dörwald69652032004-09-07 20:24:22 +0000304 line = u""
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000305 # If size is given, we call read() only once
Walter Dörwald69652032004-09-07 20:24:22 +0000306 while True:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000307 data = self.read(readsize)
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000308 if data:
Walter Dörwalda4eb2d52005-04-21 21:42:35 +0000309 # If we're at a "\r" read one extra character (which might
310 # be a "\n") to get a proper line ending. If the stream is
Walter Dörwaldbc8e6422005-04-21 21:32:03 +0000311 # temporarily exhausted we return the wrong line ending.
312 if data.endswith(u"\r"):
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000313 data += self.read(size=1, chars=1)
Walter Dörwald7a6dc132005-04-04 21:38:47 +0000314
Walter Dörwald69652032004-09-07 20:24:22 +0000315 line += data
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000316 lines = line.splitlines(True)
317 if lines:
318 line0withend = lines[0]
319 line0withoutend = lines[0].splitlines(False)[0]
320 if line0withend != line0withoutend: # We really have a line end
321 # Put the rest back together and keep it until the next call
322 self.charbuffer = u"".join(lines[1:]) + self.charbuffer
323 if keepends:
324 line = line0withend
325 else:
326 line = line0withoutend
Walter Dörwald9fa09462005-01-10 12:01:39 +0000327 break
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000328 # we didn't get anything or this was our only try
Walter Dörwald9fa09462005-01-10 12:01:39 +0000329 if not data or size is not None:
Walter Dörwalde57d7b12004-12-21 22:24:00 +0000330 if line and not keepends:
331 line = line.splitlines(False)[0]
332 break
333 if readsize<8000:
334 readsize *= 2
335 return line
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000336
Walter Dörwald69652032004-09-07 20:24:22 +0000337 def readlines(self, sizehint=None, keepends=True):
Guido van Rossuma3277132000-04-11 15:37:43 +0000338
339 """ Read all lines available on the input stream
340 and return them as list of lines.
341
342 Line breaks are implemented using the codec's decoder
343 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000344
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000345 sizehint, if given, is ignored since there is no efficient
346 way to finding the true end-of-line.
Guido van Rossuma3277132000-04-11 15:37:43 +0000347
348 """
Walter Dörwald69652032004-09-07 20:24:22 +0000349 data = self.read()
Hye-Shik Changaf5c7cf2004-10-17 23:51:21 +0000350 return data.splitlines(keepends)
Guido van Rossum0612d842000-03-10 23:20:43 +0000351
352 def reset(self):
353
354 """ Resets the codec buffers used for keeping state.
355
356 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000357 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000358 from decoding errors.
359
360 """
Walter Dörwald729c31f2005-03-14 19:06:30 +0000361 self.bytebuffer = ""
362 self.charbuffer = u""
Walter Dörwald729c31f2005-03-14 19:06:30 +0000363
Walter Dörwald71fd90d2005-03-14 19:25:41 +0000364 def seek(self, offset, whence=0):
Walter Dörwald729c31f2005-03-14 19:06:30 +0000365 """ Set the input stream's current position.
366
367 Resets the codec buffers used for keeping state.
368 """
369 self.reset()
370 self.stream.seek(offset, whence)
Guido van Rossum0612d842000-03-10 23:20:43 +0000371
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000372 def next(self):
373
374 """ Return the next decoded line from the input stream."""
375 line = self.readline()
376 if line:
377 return line
378 raise StopIteration
379
380 def __iter__(self):
381 return self
382
Tim Peters30324a72001-05-15 17:19:16 +0000383 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000384 getattr=getattr):
385
386 """ Inherit all other methods from the underlying stream.
387 """
Tim Peters30324a72001-05-15 17:19:16 +0000388 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000389
390###
391
392class StreamReaderWriter:
393
Fred Drake49fd1072000-04-13 14:11:21 +0000394 """ StreamReaderWriter instances allow wrapping streams which
395 work in both read and write modes.
396
397 The design is such that one can use the factory functions
Thomas Wouters7e474022000-07-16 12:04:32 +0000398 returned by the codec.lookup() function to construct the
Fred Drake49fd1072000-04-13 14:11:21 +0000399 instance.
400
401 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000402 # Optional attributes set by the file wrappers below
403 encoding = 'unknown'
404
Tim Peters30324a72001-05-15 17:19:16 +0000405 def __init__(self, stream, Reader, Writer, errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000406
407 """ Creates a StreamReaderWriter instance.
408
409 stream must be a Stream-like object.
410
411 Reader, Writer must be factory functions or classes
412 providing the StreamReader, StreamWriter interface resp.
413
414 Error handling is done in the same way as defined for the
415 StreamWriter/Readers.
416
417 """
418 self.stream = stream
419 self.reader = Reader(stream, errors)
420 self.writer = Writer(stream, errors)
421 self.errors = errors
422
Tim Peters30324a72001-05-15 17:19:16 +0000423 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000424
425 return self.reader.read(size)
426
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000427 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000428
429 return self.reader.readline(size)
430
Guido van Rossumd58c26f2000-05-01 16:17:32 +0000431 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000432
433 return self.reader.readlines(sizehint)
434
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000435 def next(self):
436
437 """ Return the next decoded line from the input stream."""
438 return self.reader.next()
439
440 def __iter__(self):
441 return self
442
Tim Peters30324a72001-05-15 17:19:16 +0000443 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000444
445 return self.writer.write(data)
446
Tim Peters30324a72001-05-15 17:19:16 +0000447 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000448
449 return self.writer.writelines(list)
450
Guido van Rossum0612d842000-03-10 23:20:43 +0000451 def reset(self):
452
453 self.reader.reset()
454 self.writer.reset()
455
Tim Peters30324a72001-05-15 17:19:16 +0000456 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000457 getattr=getattr):
458
459 """ Inherit all other methods from the underlying stream.
460 """
Tim Peters30324a72001-05-15 17:19:16 +0000461 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000462
463###
464
465class StreamRecoder:
466
Fred Drake49fd1072000-04-13 14:11:21 +0000467 """ StreamRecoder instances provide a frontend - backend
468 view of encoding data.
469
470 They use the complete set of APIs returned by the
471 codecs.lookup() function to implement their task.
472
473 Data written to the stream is first decoded into an
474 intermediate format (which is dependent on the given codec
475 combination) and then written to the stream using an instance
476 of the provided Writer class.
477
478 In the other direction, data is read from the stream using a
479 Reader instance and then return encoded data to the caller.
480
481 """
Guido van Rossuma3277132000-04-11 15:37:43 +0000482 # Optional attributes set by the file wrappers below
483 data_encoding = 'unknown'
484 file_encoding = 'unknown'
485
Tim Peters30324a72001-05-15 17:19:16 +0000486 def __init__(self, stream, encode, decode, Reader, Writer,
487 errors='strict'):
Guido van Rossum0612d842000-03-10 23:20:43 +0000488
489 """ Creates a StreamRecoder instance which implements a two-way
490 conversion: encode and decode work on the frontend (the
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000491 input to .read() and output of .write()) while
Guido van Rossum0612d842000-03-10 23:20:43 +0000492 Reader and Writer work on the backend (reading and
Fred Drake908670c2000-03-17 15:42:11 +0000493 writing to the stream).
Guido van Rossum0612d842000-03-10 23:20:43 +0000494
495 You can use these objects to do transparent direct
496 recodings from e.g. latin-1 to utf-8 and back.
497
498 stream must be a file-like object.
499
500 encode, decode must adhere to the Codec interface, Reader,
501 Writer must be factory functions or classes providing the
502 StreamReader, StreamWriter interface resp.
503
504 encode and decode are needed for the frontend translation,
505 Reader and Writer for the backend translation. Unicode is
506 used as intermediate encoding.
507
508 Error handling is done in the same way as defined for the
509 StreamWriter/Readers.
510
511 """
512 self.stream = stream
513 self.encode = encode
514 self.decode = decode
515 self.reader = Reader(stream, errors)
516 self.writer = Writer(stream, errors)
517 self.errors = errors
518
Tim Peters30324a72001-05-15 17:19:16 +0000519 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000520
521 data = self.reader.read(size)
522 data, bytesencoded = self.encode(data, self.errors)
523 return data
524
Tim Peters30324a72001-05-15 17:19:16 +0000525 def readline(self, size=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000526
527 if size is None:
528 data = self.reader.readline()
529 else:
530 data = self.reader.readline(size)
531 data, bytesencoded = self.encode(data, self.errors)
532 return data
533
Tim Peters30324a72001-05-15 17:19:16 +0000534 def readlines(self, sizehint=None):
Guido van Rossuma3277132000-04-11 15:37:43 +0000535
Marc-André Lemburgd5948492004-02-26 15:22:17 +0000536 data = self.reader.read()
Guido van Rossuma3277132000-04-11 15:37:43 +0000537 data, bytesencoded = self.encode(data, self.errors)
538 return data.splitlines(1)
539
Walter Dörwald4dbf1922002-11-06 16:53:44 +0000540 def next(self):
541
542 """ Return the next decoded line from the input stream."""
543 return self.reader.next()
544
545 def __iter__(self):
546 return self
547
Tim Peters30324a72001-05-15 17:19:16 +0000548 def write(self, data):
Guido van Rossum0612d842000-03-10 23:20:43 +0000549
550 data, bytesdecoded = self.decode(data, self.errors)
551 return self.writer.write(data)
552
Tim Peters30324a72001-05-15 17:19:16 +0000553 def writelines(self, list):
Guido van Rossuma3277132000-04-11 15:37:43 +0000554
555 data = ''.join(list)
556 data, bytesdecoded = self.decode(data, self.errors)
557 return self.writer.write(data)
Guido van Rossum0612d842000-03-10 23:20:43 +0000558
559 def reset(self):
560
561 self.reader.reset()
562 self.writer.reset()
563
Tim Peters30324a72001-05-15 17:19:16 +0000564 def __getattr__(self, name,
Guido van Rossum0612d842000-03-10 23:20:43 +0000565 getattr=getattr):
566
567 """ Inherit all other methods from the underlying stream.
568 """
Tim Peters30324a72001-05-15 17:19:16 +0000569 return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000570
571### Shortcuts
572
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the file is closed again before the exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (_encode, _decode, Reader, Writer) = lookup(encoding)
        srw = StreamReaderWriter(file, Reader, Writer, errors)
    except:
        # Don't leak the file that was just opened; the original
        # exception is re-raised unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000616
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings transparently.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend: the stateless functions of the data codec
    frontend_encode, frontend_decode = lookup(data_encoding)[:2]
    # backend: the stream classes of the file codec
    BackendReader, BackendWriter = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file, frontend_encode, frontend_decode,
                            BackendReader, BackendWriter, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000653
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000654### Helpers for codec lookup
655
def getencoder(encoding):

    """ Look up the codec for the given encoding and return its
        encoder function (the first entry of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
665
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return its
        decoder function (the second entry of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
675
def getreader(encoding):

    """ Look up the codec for the given encoding and return its
        StreamReader class or factory function (the third entry of
        the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
685
def getwriter(encoding):

    """ Look up the codec for the given encoding and return its
        StreamWriter class or factory function (the fourth entry of
        the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
695
Marc-André Lemburga866df82001-01-03 21:29:14 +0000696### Helpers for charmap-based codecs
697
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    return dict([(item, item) for item in rng])
710
def make_encoding_map(decoding_map):

    """ Build the inverse (encoding) map of a decoding map.

        A target that appears more than once in the decoding map
        cannot be inverted unambiguously; it is mapped to None
        (undefined mapping), causing an exception when encountered
        by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # seen before: ambiguous, mark as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000731
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000732### error handlers
733
try:
    # All five lookups happen before any name is bound, so a single
    # failure leaves every handler set to None below.
    (strict_errors, ignore_errors, replace_errors,
     xmlcharrefreplace_errors, backslashreplace_errors) = map(
        lookup_error,
        ("strict", "ignore", "replace",
         "xmlcharrefreplace", "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000747
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000748# Tell modulefinder that using codecs probably needs the encodings
749# package
750_false = 0
751if _false:
752 import encodings
753
Guido van Rossum0612d842000-03-10 23:20:43 +0000754### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000755
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')