blob: 092da0c7d7598c3704d723c0878d71482574d75b [file] [log] [blame]
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  All five predefined error
# handler objects bound near the end of this module are listed, including
# backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) and the
# byte strings it is encoded to in UTF-8, UTF-16 and UTF-32, for both
# little and big endian byte order.
#

# Encoded forms with an explicit byte order
BOM_UTF8 = '\xef\xbb\xbf'
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Marks in the native byte order of the running interpreter
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code); despite the 32/64 in the
# names they are bound to the UTF-16 and UTF-32 marks respectively.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme to use. These string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs which need state to encode efficiently should use
            StreamWriter instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme; the default is
            'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs which need state to decode efficiently should use
            StreamReader instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance wrapping stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme which is passed
            to .encode() on every write. These values are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encodes object using self.encode() and the instance's
            error handling scheme and writes the output to self.stream.
        """
        # The consumed length reported by the encoder is not used here
        output, _consumed = self.encode(object, self.errors)
        self.stream.write(output)

    def writelines(self, list):

        """ Joins the strings in list and writes the result to the
            stream with a single .write() call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After calling this method the output should be in a clean
            state, so that fresh data can be appended without having
            to rescan the whole stream to recover state.

            The base class keeps no such state; this is a no-op.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            which is not found on the writer itself.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream which decode() did not consume yet
        self.bytebuffer = ""
        # decoded characters which were not handed out to the caller yet
        self.charbuffer = u""
        # True if the last non-empty data seen by readline() ended in "\r"
        self.atcr = False

    def decode(self, input, errors='strict'):
        """ Decoder to be supplied by subclasses; must return a tuple
            (decoded characters, number of bytes consumed).
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            If neither size nor chars is given, everything up to the
            end of the stream is read and returned, including characters
            left over in the buffer by earlier calls.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                # With a size limit, buffered characters are returned
                # as they are.  Without any limit the caller asked for
                # everything, so the stream still has to be drained.
                if size >= 0 and self.charbuffer:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            newchars, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = u""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method and read() is called only once.

            If keepends is false, the line end is stripped from the
            returned line.

        """
        readsize = size or 72
        line = u""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize)
            # Drop the "\n" of a "\r\n" pair whose "\r" ended the
            # previously read data
            if self.atcr and data.startswith(u"\n"):
                data = data[1:]
            if data:
                self.atcr = data.endswith(u"\r")
            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # Grow the read size while no complete line was found
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The decoded data is split with .splitlines(); line ends
            are included in the list entries if keepends is true.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.atcr = False

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.

            whence is passed on to the stream's seek() method and
            defaults to 0, so that reader.seek(offset) works the same
            way as it does on a plain file object.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000388
389###
390
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and write
        mode.

        Reading is delegated to a StreamReader, writing to a
        StreamWriter; both are created from the factory functions
        returned by the codec.lookup() function.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Delegates to the reader's .read() method."""
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegates to the reader's .readline() method."""
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the reader's .readlines() method."""
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        """ Delegates to the writer's .write() method."""
        return self.writer.write(data)

    def writelines(self, list):

        """ Delegates to the writer's .writelines() method."""
        return self.writer.writelines(list)

    def reset(self):

        """ Resets the reader first and then the writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            which is not found on the wrapper itself.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000461
462###
463
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Reads via the backend reader and returns the data encoded
            with the frontend encoder."""
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Reads one line via the backend reader and returns it
            encoded with the frontend encoder."""
        # Only pass size on if it was given, so that the reader's own
        # default applies otherwise
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Reads everything via the backend reader and returns the
            encoded data as list of lines (line ends included).

            sizehint is ignored."""
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encoder like the results of the other read
            methods."""
        data = self.reader.next()
        # The reader hands out intermediate data; the caller expects
        # the frontend encoding.
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        """ Decodes data with the frontend decoder and writes the
            result via the backend writer."""
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Joins the strings in list, decodes the result with the
            frontend decoder and writes it via the backend writer."""
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Resets the backend reader and writer."""
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000569
570### Shortcuts
571
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object is returned
        unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or creating the wrapper fails, the
        file is closed again before the exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        codec = lookup(encoding)
        # Items 2 and 3 of the lookup result are the StreamReader and
        # StreamWriter factories
        srw = StreamReaderWriter(file, codec[2], codec[3], errors)
    except:
        # Don't leak the open file; the exception is re-raised unchanged
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000615
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates transparently
        between two encodings.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless encoder/decoder of the data encoding
    frontend = lookup(data_encoding)
    # Backend: stream reader/writer factories of the file encoding
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend[0], frontend[1],
                            backend[2], backend[3],
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000652
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000653### Helpers for codec lookup
654
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (item 0 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[0]
664
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (item 1 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[1]
674
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (item 2 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[2]
684
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (item 3 of the
        lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec[3]
694
Marc-André Lemburga866df82001-01-03 21:29:14 +0000695### Helpers for charmap-based codecs
696
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary which maps every element of the rng
        sequence to itself.

    """
    return dict([(item, item) for item in rng])
709
def make_encoding_map(decoding_map):

    """ Creates an encoding map by inverting a decoding map.

        A target which occurs more than once in the decoding map
        cannot be inverted unambiguously. Such a target is mapped to
        None (undefined mapping), causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    encoding_map = {}
    for code, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: mark the target as undefined
            encoding_map[target] = None
        else:
            encoding_map[target] = code
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000730
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000731### error handlers
732
# Bind the predefined error handlers to module level names. Either all
# of them are looked up successfully or all names are set to None.
try:
    _error_handlers = tuple(map(lookup_error,
                                ("strict", "ignore", "replace",
                                 "xmlcharrefreplace", "backslashreplace")))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    _error_handlers = (None,) * 5
(strict_errors, ignore_errors, replace_errors,
 xmlcharrefreplace_errors, backslashreplace_errors) = _error_handlers
del _error_handlers
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000746
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000747# Tell modulefinder that using codecs probably needs the encodings
748# package
749_false = 0
750if _false:
751 import encodings
752
Guido van Rossum0612d842000-03-10 23:20:43 +0000753### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000754
if __name__ == '__main__':

    # Wrap stdout: strings written to it are taken to be Latin-1 and
    # are passed on to the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Wrap stdin: the real stdin is read as Latin-1 and the data is
    # handed to the caller as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')