blob: b283925e01fa2a608420f8922739f23472530c55 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Marc-André Lemburgb28de0d2002-12-12 17:37:50 +000010import __builtin__, sys
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".  Every error handler alias
# defined near the end of this module is listed, including
# backslashreplace_errors.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian (BOM_LE is the short alias)
BOM_UTF16_LE = '\xff\xfe'
BOM_LE = BOM_UTF16_LE

# UTF-16, big endian (BOM_BE is the short alias)
BOM_UTF16_BE = '\xfe\xff'
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness variants, selected from the byte order of this machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the native UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000072
73
74### Codec base classes (defining the API)
75
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - skip the offending character and carry on
         'replace' - substitute a suitable replacement character;
                    the builtin Unicode codecs use the official
                    U+FFFD REPLACEMENT CHARACTER when decoding
                    and '?' when encoding.
         'xmlcharrefreplace' - substitute the appropriate XML
                    character reference (encoding only).
         'backslashreplace'  - substitute backslashed escape
                    sequences (encoding only).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return the tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance; codecs that need state to work efficiently
            should use the stream classes (StreamWriter) instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return the tuple
            (output object, length consumed).

            input has to provide the bf_getreadbuf buffer slot;
            Python strings, buffer objects and memory mapped files
            are examples of such objects.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance; codecs that need state to work efficiently
            should use the stream classes (StreamReader) instead.

            Zero length input must be accepted; the result is then
            an empty object of the output object type.

            This base implementation always raises
            NotImplementedError.

        """
        raise NotImplementedError
139
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000146
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream has to be a file-like object which is open for
            writing (binary) data.

            errors selects the error handling scheme used when
            encoding. Predefined values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the offending character and carry on
             'replace'- substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                       character reference.
             'backslashreplace'  - substitute backslashed escape
                       sequences (only for encoding).

            More values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() (using self.errors) and
            write the encoded data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given list of strings and hand the result to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After calling this method the output should be in a clean
            state which allows appending fresh data without having to
            rescan the whole stream to recover state.

            This base implementation does nothing.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer itself
            on the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000206
207###
208
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream which could not be decoded yet
        self.bytebuffer = ""
        # characters decoded already but not handed out yet
        self.charbuffer = u""
        # True if the data returned by the last readline() read ended
        # in "\r"; a directly following "\n" is then dropped
        self.atcr = False

    def decode(self, input, errors='strict'):

        """ Decode input and return (output object, length consumed).

            Has to be provided by subclasses; this implementation
            always raises NotImplementedError.
        """
        raise NotImplementedError

    def read(self, size=-1, chars=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The default
            value -1 indicates to read and decode as much as possible.
            size is intended to prevent having to decode huge files in
            one step.

            If neither size nor chars is given, the stream is read
            until it returns no more data, even if decoded characters
            are still buffered from an earlier call; the buffered
            characters are returned in front of the newly decoded
            ones.
        """
        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                # size limited read: hand out what is buffered already
                if self.charbuffer:
                    break
            # Unlimited read (size < 0 and chars < 0): never stop at the
            # buffer alone, otherwise read()/readlines() after a
            # readline() would return only the leftover of the last chunk.
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            newchars, decodedbytes = self.decode(data, self.errors)
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = u""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method; read() is then called only once. Without
            size, data is read in chunks starting at 72 bytes, doubling
            the chunk size until a line end is found or no more data
            is available.

            If keepends is false, the line end is stripped from the
            returned line.

        """
        readsize = size or 72
        line = u""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize)
            # the "\n" of a "\r\n" pair split over two reads belongs to
            # the line returned before
            if self.atcr and data.startswith(u"\n"):
                data = data[1:]
            if data:
                self.atcr = data.endswith(u"\r")
            line += data
            lines = line.splitlines(True)
            if lines:
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = u"".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The data returned by .read() is split with .splitlines();
            line ends are included in the list entries unless keepends
            is false.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Undecoded bytes, buffered characters and the pending
            "\\r" marker used by readline() are discarded.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.atcr = False

    def next(self):

        """ Return the next decoded line from the input stream.

            Raises StopIteration when readline() returns an empty
            line.
        """
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        # the reader is its own iterator, see next()
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000378
379###
380
class StreamReaderWriter:

    """ Wrapper for streams which are used for reading as well as
        for writing.

        The constructor takes the same kind of factory functions
        as the codec.lookup() function returns, so those can be
        passed in directly.

    """
    # Default value; the file wrappers below overwrite it per instance
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream has to be a Stream-like object.

            Reader and Writer are factory functions or classes
            providing the StreamReader resp. StreamWriter interface;
            each is called with (stream, errors).

            Error handling works as described for the
            StreamWriter/Readers.

        """
        self.stream = stream
        reader = Reader(stream, errors)
        self.reader = reader
        writer = Writer(stream, errors)
        self.writer = writer
        self.errors = errors

    def read(self, size=-1):

        """ Delegate to the reader's .read(). """
        result = self.reader.read(size)
        return result

    def readline(self, size=None):

        """ Delegate to the reader's .readline(). """
        result = self.reader.readline(size)
        return result

    def readlines(self, sizehint=None):

        """ Delegate to the reader's .readlines(). """
        result = self.reader.readlines(sizehint)
        return result

    def next(self):

        """ Return whatever the reader's .next() returns. """
        result = self.reader.next()
        return result

    def __iter__(self):
        # iteration is served by next() above
        return self

    def write(self, data):

        """ Delegate to the writer's .write(). """
        result = self.writer.write(data)
        return result

    def writelines(self, list):

        """ Delegate to the writer's .writelines(). """
        result = self.writer.writelines(list)
        return result

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the wrapper itself
            on the underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000451
452###
453
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encode function.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the backend reader and return it encoded
            with the frontend encode function.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read everything via the backend reader, encode it with the
            frontend encode function and return the list of lines
            (line ends included). sizehint is ignored.
        """
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next line from the input stream, encoded with
            the frontend encode function just like the data returned
            by .readline().
        """
        data = self.reader.next()
        # The reader hands out the intermediate format; the caller
        # expects the frontend encoding, as for read()/readline().
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        # iteration is served by next() above
        return self

    def write(self, data):

        """ Decode data with the frontend decode function and write
            the result via the backend writer.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Join the list of strings, decode the result with the
            frontend decode function and write it via the backend
            writer.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the backend reader and writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000559
560### Shortcuts
561
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or creating the wrapper fails, the
        file opened by this function is closed again before the
        exception is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        sr, sw = lookup(encoding)[2:]
        srw = StreamReaderWriter(file, sr, sw, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Don't leak the open file (e.g. on an unknown encoding); the
        # exception is re-raised unchanged, nothing is swallowed here.
        file.close()
        raise
Guido van Rossum0612d842000-03-10 23:20:43 +0000605
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates transparently
        between two encodings.

        Strings written to the wrapper are interpreted according to
        data_encoding and end up in the original file as strings in
        file_encoding. The intermediate format is usually Unicode,
        but that depends on the codecs involved.

        Strings read from the wrapper are read from the file using
        file_encoding and handed to the caller as strings in
        data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries the two extra attributes
        .data_encoding and .file_encoding holding the values used;
        Python programs can use them for introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # frontend: stateless functions of the data encoding
    frontend = lookup(data_encoding)
    encode, decode = frontend[:2]
    # backend: stream classes of the file encoding
    backend = lookup(file_encoding)
    Reader, Writer = backend[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000642
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000643### Helpers for codec lookup
644
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (entry 0 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
654
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (entry 1 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
664
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (entry 2 of
        the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
674
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (entry 3 of
        the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
684
Marc-André Lemburga866df82001-01-03 21:29:14 +0000685### Helpers for charmap-based codecs
686
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is both key and value.

    """
    pairs = [(item, item) for item in rng]
    return dict(pairs)
699
def make_encoding_map(decoding_map):

    """ Build the reverse (encoding) map for a decoding map.

        A target which occurs more than once in the decoding map
        is mapped to None (undefined mapping); the charmap codec
        raises an exception when it runs into such an entry during
        translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000720
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000721### error handlers
722
# Module level aliases for the predefined error handling callbacks,
# fetched once by name from the error handler registry.
strict_errors            = lookup_error("strict")
ignore_errors            = lookup_error("ignore")
replace_errors           = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors  = lookup_error("backslashreplace")
728
Martin v. Löwis6cd441d2001-07-31 08:54:55 +0000729# Tell modulefinder that using codecs probably needs the encodings
730# package
731_false = 0
732if _false:
733 import encodings
734
Guido van Rossum0612d842000-03-10 23:20:43 +0000735### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000736
if __name__ == '__main__':

    # Wrap stdout: strings written are taken as Latin-1 and reach the
    # real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Wrap stdin: the real stdin is read as Latin-1 and the data is
    # handed to the caller as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')