""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import struct
import types
import __builtin__

### Registry and builtin stateless codec functions

# Pull the C-implemented registry functions and the builtin stateless
# codec functions into this module's namespace.
try:
    from _codecs import *
except ImportError as why:
    # Without the _codecs extension module this module cannot work at
    # all, so the failure is reported as a SystemError carrying the
    # original import error message.
    raise SystemError('Failed to load the builtin codecs: %s' % why)

# Names exported by "from codecs import *".  Besides the registry
# functions, the file helpers and the BOM constants, this lists the
# codec base classes and the charmap helper defined in this module so
# that codec implementations can get at them the same way.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "Codec", "StreamWriter", "StreamReader",
           "StreamReaderWriter", "StreamRecoder",
           "make_identity_dict"]

### Constants

#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is U+FEFF packed as an unsigned 16-bit value in the byte order of
# the running machine ('=' selects native order with standard sizes), so
# it always equals one of BOM_BE / BOM_LE.
#
BOM = struct.pack('=H', 0xFEFF)
#
BOM_BE = BOM32_BE = struct.pack('>H', 0xFEFF)
#       '\376\377': U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored in
#       big endian byte order
BOM_LE = BOM32_LE = struct.pack('<H', 0xFEFF)
#       '\377\376': U+FEFF stored in little endian byte order; a reader
#       assuming big endian order sees U+FFFE, which is defined as
#       being an illegal Unicode character

#
# Four-byte (UCS-4) Byte Order Marks.  Note that, despite the "64" in
# their names, these values are 4 bytes (32 bits) long.
#
BOM64_BE = struct.pack('>L', 0xFEFF)
#       '\000\000\376\377': U+0000FEFF in big endian UCS-4
BOM64_LE = struct.pack('<L', 0xFEFF)
#       '\377\376\000\000': U+0000FEFF in little endian UCS-4


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        Both methods raise NotImplementedError in this base class and
        have to be overridden by concrete codecs.

    """
    def encode(self,input,errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order to
            make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self,input,errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order to
            make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#

112class StreamWriter(Codec):
113
114 def __init__(self,stream,errors='strict'):
115
116 """ Creates a StreamWriter instance.
117
118 stream must be a file-like object open for writing
119 (binary) data.
120
121 The StreamWriter may implement different error handling
122 schemes by providing the errors keyword argument. These
123 parameters are defined:
124
125 'strict' - raise a ValueError (or a subclass)
126 'ignore' - ignore the character and continue with the next
127 'replace'- replace with a suitable replacement character
128
129 """
130 self.stream = stream
131 self.errors = errors
132
Guido van Rossuma3277132000-04-11 15:37:43 +0000133 def write(self, object):
Guido van Rossum0612d842000-03-10 23:20:43 +0000134
135 """ Writes the object's contents encoded to self.stream.
136 """
137 data, consumed = self.encode(object,self.errors)
138 self.stream.write(data)
139
Guido van Rossuma3277132000-04-11 15:37:43 +0000140 def writelines(self, list):
141
142 """ Writes the concatenated list of strings to the stream
143 using .write().
144 """
145 self.write(''.join(list))
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000146
Guido van Rossum0612d842000-03-10 23:20:43 +0000147 def reset(self):
148
149 """ Flushes and resets the codec buffers used for keeping state.
150
151 Calling this method should ensure that the data on the
152 output is put into a clean state, that allows appending
153 of new fresh data without having to rescan the whole
154 stream to recover state.
155
156 """
157 pass
158
159 def __getattr__(self,name,
160
161 getattr=getattr):
162
163 """ Inherit all other methods from the underlying stream.
164 """
165 return getattr(self.stream,name)

###

169class StreamReader(Codec):
170
171 def __init__(self,stream,errors='strict'):
172
173 """ Creates a StreamReader instance.
174
175 stream must be a file-like object open for reading
176 (binary) data.
177
178 The StreamReader may implement different error handling
179 schemes by providing the errors keyword argument. These
180 parameters are defined:
181
182 'strict' - raise a ValueError (or a subclass)
183 'ignore' - ignore the character and continue with the next
184 'replace'- replace with a suitable replacement character;
185
186 """
187 self.stream = stream
188 self.errors = errors
189
Guido van Rossuma3277132000-04-11 15:37:43 +0000190 def read(self, size=-1):
Guido van Rossum0612d842000-03-10 23:20:43 +0000191
192 """ Decodes data from the stream self.stream and returns the
193 resulting object.
194
195 size indicates the approximate maximum number of bytes to
196 read from the stream for decoding purposes. The decoder
197 can modify this setting as appropriate. The default value
198 -1 indicates to read and decode as much as possible. size
199 is intended to prevent having to decode huge files in one
200 step.
201
202 The method should use a greedy read strategy meaning that
203 it should read as much data as is allowed within the
204 definition of the encoding and the given size, e.g. if
205 optional encoding endings or state markers are available
206 on the stream, these should be read too.
207
208 """
209 # Unsliced reading:
210 if size < 0:
Andrew M. Kuchlingc6c28382000-12-10 15:12:14 +0000211 return self.decode(self.stream.read(), self.errors)[0]
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000212
Guido van Rossum0612d842000-03-10 23:20:43 +0000213 # Sliced reading:
214 read = self.stream.read
215 decode = self.decode
216 data = read(size)
217 i = 0
218 while 1:
219 try:
Andrew M. Kuchlingc6c28382000-12-10 15:12:14 +0000220 object, decodedbytes = decode(data, self.errors)
Guido van Rossum0612d842000-03-10 23:20:43 +0000221 except ValueError,why:
222 # This method is slow but should work under pretty much
223 # all conditions; at most 10 tries are made
224 i = i + 1
225 newdata = read(1)
226 if not newdata or i > 10:
227 raise
228 data = data + newdata
229 else:
230 return object
231
Guido van Rossuma3277132000-04-11 15:37:43 +0000232 def readline(self, size=None):
233
234 """ Read one line from the input stream and return the
235 decoded data.
236
Fred Drake49fd1072000-04-13 14:11:21 +0000237 Note: Unlike the .readlines() method, this method inherits
238 the line breaking knowledge from the underlying stream's
239 .readline() method -- there is currently no support for
240 line breaking using the codec decoder due to lack of line
241 buffering. Sublcasses should however, if possible, try to
242 implement this method using their own knowledge of line
243 breaking.
Guido van Rossuma3277132000-04-11 15:37:43 +0000244
245 size, if given, is passed as size argument to the stream's
246 .readline() method.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000247
Guido van Rossuma3277132000-04-11 15:37:43 +0000248 """
249 if size is None:
250 line = self.stream.readline()
251 else:
252 line = self.stream.readline(size)
Andrew M. Kuchlingc6c28382000-12-10 15:12:14 +0000253 return self.decode(line,self.errors)[0]
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000254
Guido van Rossuma3277132000-04-11 15:37:43 +0000255
256 def readlines(self, sizehint=0):
257
258 """ Read all lines available on the input stream
259 and return them as list of lines.
260
261 Line breaks are implemented using the codec's decoder
262 method and are included in the list entries.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000263
Guido van Rossuma3277132000-04-11 15:37:43 +0000264 sizehint, if given, is passed as size argument to the
265 stream's .read() method.
266
267 """
268 if sizehint is None:
269 data = self.stream.read()
270 else:
271 data = self.stream.read(sizehint)
Andrew M. Kuchlingc6c28382000-12-10 15:12:14 +0000272 return self.decode(data,self.errors)[0].splitlines(1)
Guido van Rossum0612d842000-03-10 23:20:43 +0000273
274 def reset(self):
275
276 """ Resets the codec buffers used for keeping state.
277
278 Note that no stream repositioning should take place.
Thomas Wouters7e474022000-07-16 12:04:32 +0000279 This method is primarily intended to be able to recover
Guido van Rossum0612d842000-03-10 23:20:43 +0000280 from decoding errors.
281
282 """
283 pass
284
285 def __getattr__(self,name,
286
287 getattr=getattr):
288
289 """ Inherit all other methods from the underlying stream.
290 """
291 return getattr(self.stream,name)

###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Delegates to the reader's .read() method.
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Delegates to the reader's .readline() method.
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Delegates to the reader's .readlines() method.
        """
        return self.reader.readlines(sizehint)

    def write(self,data):

        """ Delegates to the writer's .write() method.
        """
        return self.writer.write(data)

    def writelines(self,list):

        """ Delegates to the writer's .writelines() method.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Resets both the reader and the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (yet).
        """
        if name == 'stream':
            # We only get here when self.stream itself is missing;
            # looking it up again below would recurse without end.
            raise AttributeError(name)
        return getattr(self.stream,name)

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            data returned by .read() and passed to .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. The
            intermediate format is whatever the given codecs agree on
            (usually Unicode).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Reads via the backend reader and returns the data encoded
            with the frontend encoder.
        """
        data = self.reader.read(size)
        return self.encode(data, self.errors)[0]

    def readline(self,size=None):

        """ Reads one line via the backend reader and returns it
            encoded with the frontend encoder.

            size, if given, is passed on to the reader's .readline().
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        return self.encode(data, self.errors)[0]

    def readlines(self,sizehint=None):

        """ Reads via the backend reader's .read(), encodes the data
            with the frontend encoder and returns it split into a list
            of lines (line breaks included).

            sizehint, if given, is passed as size to the reader's
            .read() method.
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        return self.encode(data, self.errors)[0].splitlines(1)

    def write(self,data):

        """ Decodes data with the frontend decoder and writes the
            result via the backend writer.
        """
        data = self.decode(data, self.errors)[0]
        return self.writer.write(data)

    def writelines(self,list):

        """ Joins the list of strings, decodes the result with the
            frontend decoder and writes it via the backend writer.
        """
        data = self.decode(''.join(list), self.errors)[0]
        return self.writer.write(data)

    def reset(self):

        """ Resets both the backend reader and the backend writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError if the instance has no stream
            attribute (yet).
        """
        if name == 'stream':
            # We only get here when self.stream itself is missing;
            # looking it up again below would recurse without end.
            raise AttributeError(name)
        return getattr(self.stream,name)

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        The codec is looked up before the file is opened, so an
        unknown encoding raises LookupError without touching the file.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)
    # Resolve the codec first: opening the file in a writing mode would
    # create or truncate it, which must not happen if the encoding then
    # turns out to be unknown.
    info = lookup(encoding)
    Reader, Writer = info[2], info[3]
    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    srw = StreamReaderWriter(file, Reader, Writer, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The stateless encode/decode pair comes from the data (caller
    # side) codec, the stream Reader/Writer pair from the file codec.
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file,
                            encode, decode, Reader, Writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

        An empty rng gives an empty dictionary.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

### Tests

if __name__ == '__main__':

    # Small demo, only run when the module is executed as a script:
    # wrap the standard streams with recoding wrappers.
    import sys

    # Make stdout translate Latin-1 output into UTF-8 output: strings
    # written are taken as Latin-1 and stored on the stream as UTF-8.
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input: the stream
    # is read as Latin-1 and handed to the caller as UTF-8.
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')