blob: 40f0a2e2262b30332a2611cda4ecba6fa397450f [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Martin v. Löwis02d893c2001-08-02 07:15:29 +000010import struct, __builtin__
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
# Pull every public name of the _codecs module (the registry and the
# builtin stateless codec functions) into this namespace.  A failing
# import is reported as a SystemError that carries the original
# ImportError message.
try:
    from _codecs import *
except ImportError, why:
    raise SystemError,\
          'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
# Names exported by "from codecs import *".
#
# Every error handler alias defined at the bottom of this module is
# listed here, including backslashreplace_errors, so that the star
# import exposes the complete set of handlers.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000027
Guido van Rossum0612d842000-03-10 23:20:43 +000028### Constants
29
30#
Walter Dörwald474458d2002-06-04 15:16:29 +000031# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
32# and its possible byte string values
33# for UTF8/UTF16/UTF32 output and little/big endian machines
Guido van Rossum0612d842000-03-10 23:20:43 +000034#
Guido van Rossum0612d842000-03-10 23:20:43 +000035
# UTF-8 form of the byte order mark
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16 forms with explicit byte order (little endian, big endian)
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# Short aliases for the explicit UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32 forms with explicit byte order (little endian, big endian)
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16 mark in the byte order of the running machine ('=' selects
# native byte order with standard sizes in struct)
BOM_UTF16 = struct.pack('=H', 0xFEFF)
BOM = BOM_UTF16

# UTF-32 mark in the byte order of the running machine
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code): the 32/64 suffixes refer
# to the UTF-16 and UTF-32 marks respectively
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
Guido van Rossum0612d842000-03-10 23:20:43 +000062
63
64### Codec base classes (defining the API)
65
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme.  The following string
        values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     Python uses the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; implementations return the
            tuple (output object, length consumed).

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state on the Codec
            instance; codecs which need state to work efficiently
            should build on StreamWriter / StreamReader instead.

            Zero length input must be accepted and has to produce
            an empty object of the output object type.

            This base class version always raises
            NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input; implementations return the
            tuple (output object, length consumed).

            input must provide the bf_getreadbuf buffer slot;
            Python strings, buffer objects and memory mapped files
            are examples of such objects.

            errors selects the error handling scheme and defaults
            to 'strict'.

            Implementations may not keep state on the Codec
            instance; codecs which need state to work efficiently
            should build on StreamWriter / StreamReader instead.

            Zero length input must be accepted and has to produce
            an empty object of the output object type.

            This base class version always raises
            NotImplementedError.

        """
        raise NotImplementedError()
122
123#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000128#
Guido van Rossum0612d842000-03-10 23:20:43 +0000129
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream has to be a file-like object which was opened
            for writing (binary) data.

            The errors keyword argument selects the error handling
            scheme.  These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the character and continue with the next
             'replace'- substitute a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result
            to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to
            .write() in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows appending fresh data without rescanning
            the whole stream to recover state.  The base class
            keeps no state, so there is nothing to do here.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the wrapped stream for every attribute
            that is not found on the writer itself.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000183
184###
185
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamReader on top of stream.

            stream has to be a file-like object which was opened
            for reading (binary) data.

            The errors keyword argument selects the error handling
            scheme.  These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - skip the character and continue with the next
             'replace'- substitute a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def read(self, size=-1):

        """ Read data from self.stream, decode it and return the
            resulting object.

            size is the approximate maximum number of bytes to
            fetch from the stream for decoding; the decoder may
            adjust it as appropriate.  The default of -1 means
            read and decode as much as possible.  size exists so
            that huge files need not be decoded in one step.

            The strategy is meant to be greedy: as much data as the
            encoding definition and the given size allow should be
            read, including optional encoding endings or state
            markers available on the stream.

        """
        fetch = self.stream.read

        # Without a size limit everything is decoded in one go.
        if size < 0:
            return self.decode(fetch(), self.errors)[0]

        # With a size limit the chunk may end in the middle of an
        # encoded sequence.  Each time decoding fails, one more byte
        # is appended and decoding is retried; the error is re-raised
        # once the stream is exhausted or more than 10 retries were
        # needed.
        pending = fetch(size)
        attempts = 0
        while True:
            try:
                result, _consumed = self.decode(pending, self.errors)
            except ValueError:
                attempts = attempts + 1
                extra = fetch(1)
                if attempts > 10 or not extra:
                    raise
                pending = pending + extra
            else:
                return result

    def readline(self, size=None):

        """ Read one line from the input stream and return it
            decoded.

            Note: In contrast to .readlines(), the knowledge about
            line breaks is taken from the .readline() method of the
            underlying stream -- the codec decoder is not used for
            line breaking because there is no line buffering.
            Subclasses should, where possible, implement this
            method with their own knowledge of line breaking.

            size, if given, is forwarded as size argument to the
            stream's .readline() method.

        """
        if size is not None:
            raw = self.stream.readline(size)
        else:
            raw = self.stream.readline()
        decoded = self.decode(raw, self.errors)
        return decoded[0]

    def readlines(self, sizehint=None):

        """ Read everything available on the input stream and
            return it as a list of lines.

            The data is decoded first and split afterwards; the
            line breaks stay part of the list entries.

            sizehint, if given, is forwarded as size argument to
            the stream's .read() method.

        """
        if sizehint is not None:
            raw = self.stream.read(sizehint)
        else:
            raw = self.stream.read()
        text = self.decode(raw, self.errors)[0]
        return text.splitlines(True)

    def reset(self):

        """ Reset the codec buffers used for keeping state.

            No stream repositioning should take place.  The method
            mainly exists to allow recovering from decoding errors.
            The base class keeps no state, so there is nothing to
            do here.

        """
        return None

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the wrapped stream for every attribute
            that is not found on the reader itself.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000308
309###
310
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and write
        mode.

        Reading is delegated to a reader object, writing to a
        writer object; both are created on the same stream.  The
        design allows passing in the factory functions returned by
        the codecs.lookup() function.

    """
    # Placeholder; the file wrappers further down overwrite it on
    # the instances they create
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.  Each one is called with (stream, errors).

            errors is handled the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream, self.errors = stream, errors
        # The reader is created before the writer.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Return the result of the reader's .read(size). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint). """
        reader = self.reader
        return reader.readlines(sizehint)

    def write(self, data):

        """ Return the result of the writer's .write(data). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Return the result of the writer's .writelines(list). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for part in (self.reader, self.writer):
            part.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the wrapped stream for every attribute
            that is not found on the wrapper itself.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000373
374###
375
class StreamRecoder:

    """ Frontend - backend view of encoding data.

        Instances use the complete set of APIs returned by the
        codecs.lookup() function.

        Data passed to .write() is first run through the decode
        function, giving an intermediate format (which depends on
        the given codec combination), and is then written to the
        stream through an instance of the provided Writer class.

        In the other direction data is read from the stream through
        a Reader instance, run through the encode function and
        returned to the caller.

    """
    # Placeholders; the file wrappers further down overwrite them on
    # the instances they create
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder implementing a two-way
            conversion: encode and decode work on the frontend (the
            data handed to .write() and returned by .read()) while
            Reader and Writer work on the backend (reading from and
            writing to the stream).

            Such objects allow transparent direct recodings, e.g.
            from latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively.  Each factory is called with
            (stream, errors).

            Unicode is used as intermediate encoding.

            errors is handled the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream, self.errors = stream, errors
        self.encode, self.decode = encode, decode
        # The reader is created before the writer.
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read via the backend reader and return the result
            run through the frontend encode function.
        """
        chunk = self.reader.read(size)
        return self.__recode(chunk)

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            run through the frontend encode function.  size is only
            forwarded to the reader when it is not None.
        """
        if size is not None:
            chunk = self.reader.readline(size)
        else:
            chunk = self.reader.readline()
        return self.__recode(chunk)

    def readlines(self, sizehint=None):

        """ Read via the backend reader's .read(), run the data
            through the frontend encode function and return it
            split into lines (line breaks are kept).  sizehint is
            only forwarded to the reader when it is not None.
        """
        if sizehint is not None:
            chunk = self.reader.read(sizehint)
        else:
            chunk = self.reader.read()
        return self.__recode(chunk).splitlines(True)

    def write(self, data):

        """ Run data through the frontend decode function and
            return the result of the backend writer's .write().
        """
        decoded, _consumed = self.decode(data, self.errors)
        return self.writer.write(decoded)

    def writelines(self, list):

        """ Join the strings in list, run the result through the
            frontend decode function and return the result of the
            backend writer's .write().
        """
        joined = ''.join(list)
        decoded, _consumed = self.decode(joined, self.errors)
        return self.writer.write(decoded)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for part in (self.reader, self.writer):
            part.reset()

    def __recode(self, chunk):

        # Private helper: apply the frontend encode function to data
        # which came from the backend reader and drop the length.
        encoded, _consumed = self.encode(chunk, self.errors)
        return encoded

    def __getattr__(self, name,
                    getattr=getattr):

        """ Fall back to the wrapped stream for every attribute
            that is not found on the recoder itself.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000476
477### Shortcuts
478
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, the file is always opened in
        binary mode, even if no binary mode was specified. This is
        done to avoid data loss due to encodings using 8-bit values.
        The default file mode is 'rb' meaning to open the file in
        binary read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Errors raised by the builtin open() or by the codec lookup
        (a LookupError for an unknown encoding) are passed on to the
        caller; in the latter case the already opened file is closed
        first.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
    except:
        # Do not leak the open file handle when the codec lookup or
        # the wrapper construction fails; the error itself is passed
        # on unchanged.
        file.close()
        raise
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000522
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in an object which translates between two
        encodings transparently.

        Strings written to the wrapper are interpreted according to
        data_encoding and are then written to the original file as
        strings using file_encoding.  The intermediate encoding is
        usually Unicode, but depends on the specified codecs.

        Strings read from the file are interpreted using
        file_encoding and handed back to the caller as strings
        using data_encoding.

        file_encoding defaults to data_encoding when it is not
        given.

        errors may be given to define the error handling.  It
        defaults to 'strict', which causes ValueErrors to be raised
        in case an encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, holding the values of
        the parameters of the same name for introspection by Python
        programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding

    # Frontend: stateless encoder/decoder pair of the data encoding.
    frontend_encode, frontend_decode = lookup(data_encoding)[:2]
    # Backend: stream reader/writer factories of the file encoding.
    backend_reader, backend_writer = lookup(file_encoding)[2:]

    recoder = StreamRecoder(file,
                            frontend_encode, frontend_decode,
                            backend_reader, backend_writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000559
Marc-André Lemburgaa32c5a2001-09-19 11:24:48 +0000560### Helpers for codec lookup
561
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function (entry 0 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[0]
571
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function (entry 1 of the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[1]
581
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function (entry 2 of
        the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[2]
591
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function (entry 3 of
        the lookup() result).

        Raises a LookupError in case the encoding cannot be found.

    """
    codec_entry = lookup(encoding)
    return codec_entry[3]
601
Marc-André Lemburga866df82001-01-03 21:29:14 +0000602### Helpers for charmap-based codecs
603
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a dictionary in which every element of the rng
        sequence is used both as key and as the value stored
        under that key.

    """
    return dict([(item, item) for item in rng])
616
def make_encoding_map(decoding_map):

    """ Build an encoding map by inverting a decoding map.

        A target which occurs more than once in the decoding map
        is mapped to None (undefined mapping), which causes an
        exception when the charmap codec comes across it during
        translation.

        cp875.py is one example: it decodes multiple characters
        to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous.
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000637
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000638### error handlers
639
# Module-level aliases for the error handling callbacks which
# lookup_error() returns for each of these scheme names.
strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")
645
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import statement below is never executed because
# _false is 0; it only has to be present in the source text.
_false = 0
if _false:
    import encodings
651
Guido van Rossum0612d842000-03-10 23:20:43 +0000652### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000653
# Only runs when the module is executed as a script: sys.stdout and
# sys.stdin are both replaced by EncodedFile wrappers.
if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding 'latin-1', file_encoding 'utf-8')
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (data_encoding 'utf-8', file_encoding 'latin-1')
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')