blob: 5c669c07a5d472cf0b20ccc987120d769c3bf825 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import struct,types,__builtin__
11
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
16except ImportError,why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
20### Constants
21
#
# Byte Order Mark (BOM) and its two possible serializations
# (BOM_BE, BOM_LE)
#

# U+FEFF packed as an unsigned 16-bit value; the '=' prefix asks
# struct for the byte order of the running platform
BOM = struct.pack('=H',0xFEFF)

# Bytes 0xFE 0xFF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored high
# byte first, i.e. the mark found on big endian UTF-16 data
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Bytes 0xFF 0xFE: the same character stored low byte first, i.e. the
# mark found on little endian UTF-16 data; read in the wrong (big
# endian) order these bytes give U+FFFE, which is defined as being
# an illegal Unicode character
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# Four-byte (UCS-4) Byte Order Marks; note that despite the "64" in
# their names both values are 4 bytes long
#

# U+0000FEFF stored high byte first
BOM64_BE = '\000\000\376\377'

# U+0000FEFF stored low byte first
BOM64_LE = '\377\376\000\000'

41
42
43### Codec base classes (defining the API)
44
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme. These string values are
        defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class implements neither method: both raise
        NotImplementedError and have to be overridden.

    """

    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Implementations return a tuple (output object, length
            consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            No state may be kept in the Codec instance. Codecs which
            need state to encode/decode efficiently should use the
            stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be handled and has to result in an
            empty object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Implementations return a tuple (output object, length
            consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects and
            memory mapped files.

            errors selects the error handling to apply; the default
            is 'strict'.

            No state may be kept in the Codec instance. Codecs which
            need state to encode/decode efficiently should use the
            stream classes (StreamWriter/StreamReader) instead.

            Zero length input must be handled and has to result in an
            empty object of the output object type.

        """
        raise NotImplementedError
101
102#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
107#
108
class StreamWriter(Codec):

    """ Codec which writes encoded data to an underlying stream.

        .write() relies on an .encode() method following the Codec
        interface. Attributes which are not found on the writer
        itself are looked up on the wrapped stream.

    """

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter working on stream.

            stream has to be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme which .write()
            hands on to .encode(). These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object and write the result to self.stream.

            The length consumed as reported by .encode() is not used.

        """
        output, _consumed = self.encode(object, self.errors)
        self.stream.write(output)

    # XXX .writelines() ?

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After calling this method the data on the output should
            be in a clean state which allows appending fresh data
            without having to rescan the whole stream to recover
            state.

            This default implementation does nothing.

        """

    def __getattr__(self, name,

                    getattr=getattr):

        """ Delegate all other attribute lookups to the underlying
            stream.
        """
        stream = self.stream
        return getattr(stream, name)
158
159###
160
class StreamReader(Codec):

    """ Codec which reads and decodes data from an underlying stream.

        .read() relies on a .decode() method following the Codec
        interface. Attributes which are not found on the reader
        itself are looked up on the wrapped stream.

    """

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. The
            value is passed on to .decode() by .read(). These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling scheme stored in
            self.errors.

            If a sliced read cannot be decoded (ValueError), up to
            ten more single bytes are read and appended before the
            ValueError is passed on to the caller.

        """
        errors = self.errors

        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                object, decodedbytes = decode(data, errors)
            except ValueError:
                # The slice may have ended in the middle of a
                # multi-byte sequence: extend it one byte at a time.
                # This method is slow but should work under pretty
                # much all conditions; at most 10 tries are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return object

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This default implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
245
246###
247
class StreamReaderWriter:

    """ Combines a StreamReader and a StreamWriter working on the
        same stream into one object.

        Attributes which are not found on the instance itself are
        looked up on the wrapped stream.

    """

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter working on stream.

            stream has to be a Stream-like object.

            Reader and Writer have to be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp. Both are called with (stream, errors).

            Error handling works as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Return the result of reading size from the reader.
        """
        reader = self.reader
        return reader.read(size)

    def write(self, data):

        """ Pass data on to the writer and return its result.
        """
        writer = self.writer
        return writer.write(data)

    def reset(self):

        """ Reset the reader first and the writer second.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Delegate all other attribute lookups to the underlying
            stream.
        """
        stream = self.stream
        return getattr(stream, name)
288
289###
290
class StreamRecoder:

    """ Two-way recoding wrapper around a stream.

        Attributes which are not found on the instance itself are
        looked up on the wrapped stream.

    """

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while Reader
            and Writer work on the backend (reading from and writing
            to the stream).

            These objects can be used for transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream has to be a file-like object.

            encode and decode have to adhere to the Codec interface;
            Reader and Writer have to be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp. Both are called with (stream, errors).

            Unicode is used as intermediate encoding between the
            frontend and the backend translation.

            Error handling works as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.errors = errors
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read size from the backend reader and return the data
            re-encoded with the frontend encode function.
        """
        intermediate = self.reader.read(size)
        recoded, _consumed = self.encode(intermediate, self.errors)
        return recoded

    def write(self, data):

        """ Decode data with the frontend decode function, pass the
            result to the backend writer and return the writer's
            result.
        """
        intermediate, _consumed = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    # .writelines(), .readline() and .readlines() ... see notes
    # above.

    def reset(self):

        """ Reset the reader first and the writer second.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Delegate all other attribute lookups to the underlying
            stream.
        """
        stream = self.stream
        return getattr(stream, name)
351
352### Shortcuts
353
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file. If it is given, the file is always opened in binary
        mode ('b' is appended to mode when missing). If it is None,
        the plain file object returned by the builtin open() is
        returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        If looking up the codec for encoding fails, the file opened
        so far is closed again before the error is passed on.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    try:
        (_encode, _decode, Reader, Writer) = lookup(encoding)
    except:
        # Cleanup only -- the error is re-raised unchanged. Without
        # this the file would stay open although the caller never
        # gets a reference to it.
        stream.close()
        raise
    return StreamReaderWriter(stream, Reader, Writer, errors)
384
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file in a StreamRecoder providing transparent encoding
        translation and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to the input encoding and then written to the original file
        as string using the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        output defaults to input if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    backend_encoding = output
    if backend_encoding is None:
        backend_encoding = input
    # The frontend uses the stateless functions of the input codec ...
    encode, decode = lookup(input)[:2]
    # ... while the backend uses the stream classes of the output codec
    Reader, Writer = lookup(backend_encoding)[2:]
    return StreamRecoder(file, encode, decode, Reader, Writer, errors)
410
411### Tests
412
if __name__ == '__main__':

    import sys

    # Replace stdout by a recoding wrapper: strings written to it are
    # interpreted as Latin-1 and reach the real stdout as
    # Unicode-Escape
    sys.stdout = EncodedFile(
        sys.stdout,
        'latin-1',
        'unicode-escape')