blob: 6a61e1aa60434038246a72303e2612cd1b621655 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import struct,types,__builtin__
11
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
16except ImportError,why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
20### Constants
21
#
# Byte Order Mark (BOM): U+FEFF packed as an unsigned 16-bit value in
# the byte order of the running platform, followed by explicit
# constants for both possible byte orders (BOM_BE, BOM_LE).
#
BOM = struct.pack('=H',0xFEFF)

# Bytes 0xFE 0xFF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored with the
# most significant byte first (big endian).
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Bytes 0xFF 0xFE: U+FEFF stored with the least significant byte first
# (little endian). Read with the wrong (big endian) byte order these
# bytes give U+FFFE, which is defined as an illegal Unicode character.
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# Four byte Byte Order Marks (the constants carry a "64" suffix)
#

# U+0000FEFF in UCS-4, most significant byte first
BOM64_BE = '\000\000\376\377'

# U+0000FEFF in UCS-4, least significant byte first; read with the
# wrong byte order this gives the value 0xFFFE0000
BOM64_LE = '\377\376\000\000'

41
42
43### Codec base classes (defining the API)
44
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise a ValueError error (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class only defines the API: both methods raise
        NotImplementedError and have to be overridden by subclasses.

    """
    def encode(self,input,errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme to apply; it
            defaults to 'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs which need state to encode efficiently should use
            the StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self,input,errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme to apply; it
            defaults to 'strict'.

            Implementations may not keep state in the Codec instance.
            Codecs which need state to decode efficiently should use
            the StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()
101
102#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
107#
108
class StreamWriter(Codec):

    """ Codec which encodes objects and writes the result to a stream.

        Subclasses supply .encode(); attributes not defined here are
        looked up on the wrapped stream.

    """
    def __init__(self,stream,errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme used when
            encoding. These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object using self.errors as error handling and
            write the encoded data to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list into one string and hand it
            to .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """

    def __getattr__(self, name, getattr=getattr):

        """ Fetch all attributes not found on the writer itself
            from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
163
164###
165
class StreamReader(Codec):

    """ Codec which reads data from a stream and decodes it.

        Subclasses supply .decode(); attributes not defined here are
        looked up on the wrapped stream.

    """
    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible; None
            is treated the same way. size is intended to prevent
            having to decode huge files in one step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding is done using the error handling scheme given
            in self.errors. If decoding a sized chunk raises a
            ValueError, single bytes are read and appended (at most
            10) and decoding is retried; when the stream is exhausted
            or the retries are used up, the ValueError is re-raised.

        """
        # Unsliced reading:
        if size is None or size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        errors = self.errors
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, errors)
            except ValueError:
                # The chunk may end in the middle of a multi-byte
                # sequence. This method is slow but should work under
                # pretty much all conditions; at most 10 tries are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, line breaking must
            be implemented by the underlying stream's .readline()
            method -- there is currently no support for line breaking
            using the codec decoder due to lack of line buffering.

            size, if given, is passed as size argument to the stream's
            .readline() method.

            Decoding is done using the error handling scheme given
            in self.errors.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=0):

        """ Read all lines available on the input stream
            and return them as list of lines.

            The data is decoded first (using the error handling
            scheme given in self.errors) and then split into lines;
            line breaks are included in the list entries.

            sizehint, if given and positive, is passed as size
            argument to the stream's .read() method. The default 0,
            None and negative values mean: read everything that is
            available.

        """
        # A hint of 0 must not be passed on to .read(): that would
        # read no data at all and always produce an empty list.
        if sizehint is None or sizehint <= 0:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
286
287###
288
class StreamReaderWriter:

    """ Wraps a stream with a reader and a writer object so that it
        can be used for both decoded reading and encoded writing.

        Read calls are forwarded to self.reader, write calls to
        self.writer; all other attributes are looked up on the
        underlying stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Both are called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Forward to the reader's .read() and return its result.
        """
        return self.reader.read(size)

    def readline(self,size=None):

        """ Forward to the reader's .readline() and return its result.
        """
        return self.reader.readline(size)

    def readlines(self,sizehint=None):

        """ Forward to the reader's .readlines() and return its result.
        """
        return self.reader.readlines(sizehint)

    def write(self,data):

        """ Forward to the writer's .write() and return its result.
        """
        return self.writer.write(data)

    def writelines(self,list):

        """ Forward to the writer's .writelines() and return its result.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
344
345###
346
class StreamRecoder:

    """ Two-way recoding wrapper around a stream.

        Data read through the wrapper is fetched via self.reader and
        then passed through self.encode; data written to the wrapper
        is passed through self.decode and then handed to self.writer.
        All other attributes are looked up on the underlying stream.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.; both are
            called with (stream, errors).

            Unicode is used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self,size=-1):

        """ Read via the backend reader and return the data
            re-encoded with the frontend encoder.
        """
        decoded = self.reader.read(size)
        recoded, _length = self.encode(decoded, self.errors)
        return recoded

    def readline(self,size=None):

        """ Read one line via the backend reader and return it
            re-encoded with the frontend encoder.

            size is only passed on to the reader if it is not None.
        """
        if size is not None:
            decoded = self.reader.readline(size)
        else:
            decoded = self.reader.readline()
        recoded, _length = self.encode(decoded, self.errors)
        return recoded

    def readlines(self,sizehint=None):

        """ Read data via the backend reader's .read(), re-encode it
            with the frontend encoder and return it split into a
            list of lines (line breaks included).

            sizehint is only passed on to the reader if it is not None.
        """
        if sizehint is not None:
            decoded = self.reader.read(sizehint)
        else:
            decoded = self.reader.read()
        recoded, _length = self.encode(decoded, self.errors)
        return recoded.splitlines(1)

    def write(self,data):

        """ Decode data with the frontend decoder and pass the result
            to the backend writer; returns the writer's result.
        """
        decoded, _length = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self,list):

        """ Join the strings in list, decode the result with the
            frontend decoder and pass it to the backend writer's
            .write(); returns the writer's result.
        """
        joined = ''.join(list)
        decoded, _length = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the backend reader first, then the backend writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name, getattr=getattr):

        """ Fetch all attributes not found on the recoder itself
            from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
432
433### Shortcuts
434
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file object returned by the builtin
        open() is passed back unwrapped and mode is used unchanged.
        Otherwise the file is always opened in binary mode ('b' is
        appended to mode if missing).

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The codec is looked up before the file is opened, so an
        unknown encoding is reported without creating or truncating
        the file.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first: failing here must not leave an open
    # (and, for write modes, already truncated) file behind.
    Reader, Writer = lookup(encoding)[2:]

    # Force opening of the file in binary mode
    if 'b' not in mode:
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    try:
        srw = StreamReaderWriter(file, Reader, Writer, errors)
    except:
        # Don't leak the file handle if the wrapper can't be built;
        # the original exception is passed on unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000468
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which transparently translates
        between two encodings and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        data_encoding and file_encoding are added to the wrapped file
        object as attributes .data_encoding and .file_encoding resp.

    """
    if file_encoding is None:
        file_encoding = data_encoding

    # Frontend: the stateless encode/decode pair of the data encoding
    frontend = lookup(data_encoding)[:2]
    encode, decode = frontend

    # Backend: the stream reader/writer factories of the file encoding
    backend = lookup(file_encoding)[2:]
    Reader, Writer = backend

    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)

    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000503
504### Tests
505
if __name__ == '__main__':

    import sys

    # Data written to stdout is taken to be Latin-1 and ends up on
    # the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data on the real stdin is taken to be Latin-1 and is handed to
    # readers of sys.stdin as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')