blob: 993113752ef1859750aa83e8f0eb2f4ebf019188 [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import struct,types,__builtin__
11
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
16except ImportError,why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
20### Constants
21
#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is the code point U+FEFF packed as an unsigned 16-bit value
# ('H') using the byte order of the machine running this code ('=').
BOM = struct.pack('=H',0xFEFF)
#
BOM_BE = BOM32_BE = '\376\377'
# the bytes FE FF, i.e. Unicode U+FEFF (ZERO WIDTH NO-BREAK SPACE)
# as it is written in UTF-16 on big endian platforms
BOM_LE = BOM32_LE = '\377\376'
# the bytes FF FE, i.e. Unicode U+FEFF as it is written in UTF-16 on
# little endian platforms; read with big endian byte order the same
# bytes give U+FFFE, which is defined as being an illegal Unicode
# character -- this is what makes the byte order detectable

#
# Byte Order Marks for 4-byte code units (UCS-4). Note that despite
# the "64" in the names each value is four bytes long.
#
BOM64_BE = '\000\000\376\377'
# the bytes 00 00 FE FF: U+0000FEFF written in big endian UCS-4
BOM64_LE = '\377\376\000\000'
# the bytes FF FE 00 00: U+0000FEFF written in little endian UCS-4
41
42
43### Codec base classes (defining the API)
44
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise a ValueError error (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; implementations return the
            tuple (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec
            instance. Codecs that need state to work efficiently
            should be written as StreamWriter/StreamReader
            subclasses instead.

            Zero length input has to be accepted; the result is then
            an empty object of the output object type.

            This base implementation only defines the signature and
            always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input; implementations return the
            tuple (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec
            instance. Codecs that need state to work efficiently
            should be written as StreamWriter/StreamReader
            subclasses instead.

            Zero length input has to be accepted; the result is then
            an empty object of the output object type.

            This base implementation only defines the signature and
            always raises NotImplementedError.

        """
        raise NotImplementedError()
101
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
Guido van Rossum0612d842000-03-10 23:20:43 +0000108
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that is handed
            to .encode() on every write. These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream, self.errors = stream, errors

    def write(self, object):

        """ Encode object with .encode() and write the result to
            self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and pass the result on to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state, so
            that fresh data can be appended without rescanning the
            whole stream to recover state.

            The base class keeps no state, so nothing is done here.

        """
        return None

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on the writer on the
            underlying stream.
        """
        # The builtin is bound as default argument, exactly as in the
        # public signature, so the lookup stays local.
        target = self.stream
        return getattr(target, name)
163
164###
165
class StreamReader(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If decoding a sized chunk raises a ValueError, single
            bytes are appended and decoding is retried; the error is
            re-raised when the stream has no more data or more than
            10 retries were needed.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, self.errors)
            except ValueError:
                # The chunk may have ended in the middle of a
                # character. This method is slow but should work
                # under pretty much all conditions; at most 10 tries
                # are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line,self.errors)[0]


    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method. If it is left at its default of
            None, the stream is read without a size limit.

        """
        # The default has to be None: a numeric default such as 0
        # would be handed to .read() and yield no data at all.
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data,self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base class keeps no state, so nothing is done here.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
289
290###
291
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and write
        mode.

        Reading is delegated to a reader object and writing to a
        writer object, both created on the same stream. The factory
        functions returned by the codecs.lookup() function can be
        used to construct an instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each one is called with (stream, errors).

            errors is handled in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        reader = Reader(stream, errors)
        self.reader = reader
        writer = Writer(stream, errors)
        self.writer = writer
        self.errors = errors

    def read(self, size=-1):

        """ Hand the call on to the reader's .read(). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Hand the call on to the reader's .readline(). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Hand the call on to the reader's .readlines(). """
        reader = self.reader
        return reader.readlines(sizehint)

    def write(self, data):

        """ Hand the call on to the writer's .write(). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Hand the call on to the writer's .writelines(). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first and the writer second. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on the wrapper on the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)
355
356###
357
class StreamRecoder:

    """ Frontend/backend view of encoding data.

        A StreamRecoder uses the complete set of APIs returned by
        the codecs.lookup() function.

        Data written to the recoder is first run through the decode
        function, which turns it into an intermediate format
        (dependent on the given codec combination), and is then
        written to the stream by the Writer instance.

        In the other direction, data is read from the stream by the
        Reader instance, run through the encode function and handed
        back to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer, errors='strict'):

        """ Set up a StreamRecoder which implements a two-way
            conversion: encode and decode work on the frontend (the
            output of .read() and the input to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            These objects can be used to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp. Each factory
            is called with (stream, errors).

            Unicode is used as intermediate encoding.

            errors is handled in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode, self.decode = encode, decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the Reader and return the result run through
            encode.
        """
        intermediate = self.reader.read(size)
        encoded, _consumed = self.encode(intermediate, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line via the Reader and return it run through
            encode. size is only passed on when it is given.
        """
        if size is not None:
            intermediate = self.reader.readline(size)
        else:
            intermediate = self.reader.readline()
        encoded, _consumed = self.encode(intermediate, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read via the Reader's .read(), run the data through
            encode and return it split into lines; line ends are
            kept. sizehint is only passed on when it is given.
        """
        if sizehint is not None:
            intermediate = self.reader.read(sizehint)
        else:
            intermediate = self.reader.read()
        encoded, _consumed = self.encode(intermediate, self.errors)
        return encoded.splitlines(1)

    def write(self, data):

        """ Run data through decode and write the result via the
            Writer.
        """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the strings in list, run the result through decode
            and write it via the Writer.
        """
        joined = ''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the reader first and the writer second. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,

                    getattr=getattr):

        """ Look up every attribute not found on the recoder on the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)
458
459### Shortcuts
460
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file object returned by the builtin
        open() is passed back unwrapped and mode is used unchanged.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        An unknown encoding makes lookup() fail before the file is
        opened, so the file is neither created nor truncated and no
        open file is left behind in that case.

    """
    if encoding is None:
        # No codec requested: nothing to wrap
        return __builtin__.open(filename, mode, buffering)
    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    # Resolve the codec first; opening the file first would leak the
    # file object (and clobber the file in write modes) whenever the
    # encoding turns out to be unknown.
    (encode, decode, sr, sw) = lookup(encoding)
    file = __builtin__.open(filename, mode, buffering)
    try:
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Don't leave the file open if the wrapper can't be built;
        # the original error is passed on unchanged
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000504
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which provides transparent
        encoding translation and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: the stateless encode/decode pair of the data codec
    frontend = lookup(data_encoding)[:2]
    encode, decode = frontend
    # Backend: the stream reader/writer factories of the file codec
    backend = lookup(file_encoding)[2:]
    Reader, Writer = backend
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000541
Marc-André Lemburga866df82001-01-03 21:29:14 +0000542### Helpers for charmap-based codecs
543
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is used as key and as the value stored under
        that key.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
556
Guido van Rossum0612d842000-03-10 23:20:43 +0000557### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000558
# Only executed when the module is run as a script: rebinds the
# standard streams to recoding wrappers built with EncodedFile().
if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output:
    # strings written are interpreted as 'latin-1' (data_encoding)
    # and reach the real stdout as 'utf-8' (file_encoding)
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input:
    # the real stdin is read as 'latin-1' (file_encoding) and the
    # caller gets the data back as 'utf-8' (data_encoding)
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')