blob: b8b32d5a4155f1f7e32dade216bd79c1d78ea3e7 [file] [log] [blame]
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
9
10import struct,types,__builtin__
11
12### Registry and builtin stateless codec functions
13
14from _codecs import *
15
16### Constants
17
#
# Byte Order Marks (BOM)
#
# NOTE(review): despite the 32/64 in some of the names below, the
# values are 2 and 4 bytes long respectively; the names are kept
# unchanged because they are part of the module's public interface.
#

# U+FEFF packed as an unsigned 16-bit value in the byte order of the
# platform running this module.
BOM = struct.pack('=H', 0xFEFF)

# Two-byte mark, most significant byte first (bytes FE FF):
# U+FEFF ZERO WIDTH NO-BREAK SPACE as seen by a big endian reader.
BOM_BE = '\xfe\xff'
BOM32_BE = BOM_BE

# Two-byte mark, least significant byte first (bytes FF FE).  A big
# endian reader sees U+FFFE here, which is defined as being an
# illegal Unicode character.
BOM_LE = '\xff\xfe'
BOM32_LE = BOM_LE

#
# Four-byte marks (UCS-4)
#

# Bytes 00 00 FE FF: U+0000FEFF, most significant byte first.
BOM64_BE = '\x00\x00\xfe\xff'

# Bytes FF FE 00 00: the same mark, least significant byte first.
BOM64_LE = '\xff\xfe\x00\x00'
37
38
39### Codec base classes (defining the API)
40
class Codec:

    """ Interface for stateless encoders and decoders.

        Subclasses supply .encode() and .decode(); the versions
        defined here only raise NotImplementedError.

        Both methods take an errors argument which selects the error
        handling scheme.  These string values are defined:

         'strict'  - raise an error (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """

    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance; codecs which need state to work efficiently
            should build on the stream classes (StreamWriter /
            StreamReader) instead.

            Zero length input has to be accepted; the result is then
            an empty object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations must not keep state on the Codec
            instance; codecs which need state to work efficiently
            should build on the stream classes (StreamWriter /
            StreamReader) instead.

            Zero length input has to be accepted; the result is then
            an empty object of the output object type.

        """
        raise NotImplementedError
97
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
104
class StreamWriter(Codec):

    """ Writes objects to an underlying stream after passing them
        through the codec's .encode() method.

        Attributes which are not defined here are looked up on the
        wrapped stream.

    """

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self,object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using the error
            handling stored in self.errors.  Nothing is returned.

        """
        data, consumed = self.encode(object,self.errors)
        self.stream.write(data)

    def writelines(self,lines):

        """ Writes every object in the sequence lines to self.stream,
            encoding each one via .write().

            Without this method the call would be forwarded by
            __getattr__ to the underlying stream, which would then
            receive the objects unencoded.

        """
        for line in lines:
            self.write(line)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
154
155###
156
class StreamReader(Codec):

    """ Reads data from an underlying stream and passes it through
        the codec's .decode() method.

        Attributes which are not defined here are looked up on the
        wrapped stream.

    """

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling stored in self.errors.
            A ValueError raised by the decoder is propagated when the
            stream is exhausted or when more than 10 extra bytes had
            to be read without the data becoming decodable.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        errors = self.errors
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, consumed = decode(data, errors)
            except ValueError:
                # The slice may have cut an encoded sequence in half:
                # pull in one more byte and try again.  This method is
                # slow but should work under pretty much all
                # conditions; at most 10 tries are made.
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
241
242###
243
class StreamReaderWriter:

    """ Wraps one stream with both a reader and a writer object so
        that it can be used for reading and writing.

        Attributes which are not defined here are looked up on the
        wrapped stream.

    """

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp.  Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Returns the result of the reader's .read(size).
        """
        reader = self.reader
        return reader.read(size)

    def write(self, data):

        """ Hands data to the writer's .write() and returns its result.
        """
        writer = self.writer
        return writer.write(data)

    def reset(self):

        """ Resets the reader first and then the writer.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
284
285###
286
class StreamRecoder:

    """ Translates data between two encodings while reading from and
        writing to a stream.

        Attributes which are not defined here are looked up on the
        wrapped stream.

    """

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a
            two-way conversion: encode and decode work on the
            frontend (the output of .read() and the input to
            .write()) while Reader and Writer work on the backend
            (reading from and writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp.  Each factory is called as factory(stream, errors).

            Unicode is used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Reads via the backend reader and returns the result
            re-encoded with the frontend encode function.
        """
        intermediate = self.reader.read(size)
        recoded, consumed = self.encode(intermediate, self.errors)
        return recoded

    def write(self, data):

        """ Decodes data with the frontend decode function and passes
            the result to the backend writer; returns whatever the
            writer's .write() returns.
        """
        intermediate, consumed = self.decode(data, self.errors)
        writer = self.writer
        return writer.write(intermediate)

    # .writelines(), .readline() and .readlines() are not provided
    # here; see the notes in StreamWriter and StreamReader.

    def reset(self):

        """ Resets the backend reader first and then the writer.
        """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)
347
348### Shortcuts
349
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file.  If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped and mode is used as
        given.  Otherwise the file is always opened in binary mode
        ('b' is appended to mode when missing).

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The codec is looked up before the file is touched, so an
        unknown encoding raises the lookup error without creating or
        truncating filename.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first: opening in a write mode creates or
    # truncates the file, which must not happen for a bad encoding.
    (e,d,sr,sw) = lookup(encoding)

    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    try:
        return StreamReaderWriter(file, sr, sw, errors)
    except:
        # Do not leak the open file if the wrapper cannot be built.
        file.close()
        raise
380
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings transparently.

        Strings written to the wrapped file are interpreted according
        to the input encoding and are then written to the original
        file as strings in the output encoding.  The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        output defaults to input when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    file_encoding = output
    if file_encoding is None:
        file_encoding = input

    # Frontend: the stateless encode/decode pair of the input codec.
    frontend_encode, frontend_decode = lookup(input)[:2]
    # Backend: the stream reader/writer factories of the output codec.
    backend_reader, backend_writer = lookup(file_encoding)[2:]

    return StreamRecoder(file,
                         frontend_encode, frontend_decode,
                         backend_reader, backend_writer,
                         errors)
406
407### Tests
408
409if __name__ == '__main__':
410
411 import sys
412
413 # Make stdout translate Latin-1 into Unicode-Escape
414 sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')