blob: b3c2f71efd6aa5faad522ae16ac8e7291e44863c [file] [log] [blame]
Guido van Rossum0612d842000-03-10 23:20:43 +00001""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
Tim Peters30324a72001-05-15 17:19:16 +000010import struct, types, __builtin__
Guido van Rossum0612d842000-03-10 23:20:43 +000011
12### Registry and builtin stateless codec functions
13
Guido van Rossumb95de4f2000-03-31 17:25:23 +000014try:
15 from _codecs import *
Tim Peters30324a72001-05-15 17:19:16 +000016except ImportError, why:
Guido van Rossumb95de4f2000-03-31 17:25:23 +000017 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
Guido van Rossum0612d842000-03-10 23:20:43 +000019
Tim Peters30324a72001-05-15 17:19:16 +000020__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE"]
Skip Montanaroe99d5ea2001-01-20 19:54:20 +000022
Guido van Rossum0612d842000-03-10 23:20:43 +000023### Constants
24
25#
26# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
27#
BOM = struct.pack('=H', 0xFEFF)
# U+FEFF packed as one unsigned 16-bit value in the native byte order
# of the running machine ('=' selects native order in struct), so BOM
# equals either BOM_BE or BOM_LE below.
#
BOM_BE = BOM32_BE = '\376\377'
# Bytes FE FF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) serialized as a
# single big endian 16-bit unit.  NOTE: despite the "32" in the alias
# BOM32_BE, this mark is two bytes long.
BOM_LE = BOM32_LE = '\377\376'
# Bytes FF FE: the same U+FEFF serialized in little endian order.  Read
# with the wrong (big endian) byte order these bytes yield U+FFFE, which
# is defined as being an illegal Unicode character -- that is what makes
# the byte order detectable.  BOM32_LE is likewise two bytes long.

#
# 4-byte Byte Order Marks (named BOM64_* although they are 32 bits wide)
#
BOM64_BE = '\000\000\376\377'
# Bytes 00 00 FE FF: U+0000FEFF in big endian UCS-4
BOM64_LE = '\377\376\000\000'
# Bytes FF FE 00 00: U+0000FEFF in little endian UCS-4
Guido van Rossum0612d842000-03-10 23:20:43 +000044
45
46### Codec base classes (defining the API)
47
class Codec:

    """ Interface definition for stateless encoders and decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on with
                     the next one
         'replace' - substitute a suitable replacement character;
                     Python uses the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec instance;
            codecs which need state to work efficiently should be
            written as StreamWriter / StreamReader subclasses.

            Zero length input has to be supported: the result is then
            an empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input has to provide the bf_getreadbuf buffer slot; Python
            strings, buffer objects and memory mapped files are
            examples of such objects.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations must not keep state on the Codec instance;
            codecs which need state to work efficiently should be
            written as StreamWriter / StreamReader subclasses.

            Zero length input has to be supported: the result is then
            an empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
104
105#
106# The StreamWriter and StreamReader class provide generic working
107# interfaces which can be used to implement new encodings submodules
108# very easily. See encodings/utf_8.py for an example on how this is
109# done.
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000110#
Guido van Rossum0612d842000-03-10 23:20:43 +0000111
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream has to be a file-like object which is open for
            writing (binary) data.

            The errors keyword argument selects the error handling
            scheme used when encoding. These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.errors = errors
        self.stream = stream

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream. The consumed length reported by the encoder
            is not used.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings into one string and pass it to
            .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After the call the output should be in a clean state, so
            that new data can be appended without rescanning the
            whole stream to recover state.

            The base class keeps no state; this is a no-op hook.

        """

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the writer to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000165
166###
167
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If the chunk of size bytes cannot be decoded (ValueError),
            up to 10 further single bytes are read and appended before
            the error is re-raised; this lets a character that was cut
            in half by the size limit be completed.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, self.errors)
            except ValueError:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    # End of stream or retry budget exhausted: let the
                    # original decoding error propagate.
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=0):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given as a positive number, is passed as size
            argument to the stream's .read() method; the data read may
            then end in the middle of a line. If sizehint is omitted,
            None, zero or negative, the whole stream is read.

        """
        # The default used to be forwarded as stream.read(0), which
        # reads nothing, so a plain .readlines() always returned [].
        # Only a positive hint limits the read.
        if sizehint is None or sizehint <= 0:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base class keeps no state; this is a no-op hook.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000290
291###
292
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and write
        mode.

        The constructor signature is chosen so that the factory
        functions returned by codecs.lookup() can be passed in
        directly to build an instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively; each is called as factory(stream, errors).

            errors is handled in the same way as defined for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Forward to the reader's .read(). """
        reader = self.reader
        return reader.read(size)

    def readline(self, size=None):

        """ Forward to the reader's .readline(). """
        reader = self.reader
        return reader.readline(size)

    def readlines(self, sizehint=None):

        """ Forward to the reader's .readlines(). """
        reader = self.reader
        return reader.readlines(sizehint)

    def write(self, data):

        """ Forward to the writer's .write(). """
        writer = self.writer
        return writer.write(data)

    def writelines(self, list):

        """ Forward to the writer's .writelines(). """
        writer = self.writer
        return writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the wrapper to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000355
356###
357
class StreamRecoder:

    """ Frontend / backend view of encoded data.

        A StreamRecoder uses the complete set of APIs returned by the
        codecs.lookup() function to do its work.

        Data written to the recoder is first decoded into an
        intermediate format (which depends on the given codec
        combination) and then written to the stream by an instance
        of the provided Writer class.

        In the other direction, data is read from the stream by a
        Reader instance, encoded and then returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder implementing a two-way conversion.

            encode and decode work on the frontend (the output of
            .read() and the input to .write()), while Reader and
            Writer work on the backend (reading from and writing to
            the stream).

            These objects allow transparent direct recodings, e.g.
            from latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            respectively. Unicode is used as intermediate encoding.

            errors is handled in the same way as defined for the
            StreamWriter/Readers.

        """
        self.errors = errors
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)

    def read(self, size=-1):

        """ Read via the backend reader and return the data encoded
            with the frontend encoder.
        """
        intermediate = self.reader.read(size)
        encoded, _consumed = self.encode(intermediate, self.errors)
        return encoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it
            encoded with the frontend encoder. size is only passed
            on when given.
        """
        if size is not None:
            intermediate = self.reader.readline(size)
        else:
            intermediate = self.reader.readline()
        encoded, _consumed = self.encode(intermediate, self.errors)
        return encoded

    def readlines(self, sizehint=None):

        """ Read via the backend reader's .read(), encode the data
            with the frontend encoder and return it split into lines
            (line breaks included). sizehint is only passed on when
            given.
        """
        if sizehint is not None:
            intermediate = self.reader.read(sizehint)
        else:
            intermediate = self.reader.read()
        encoded, _consumed = self.encode(intermediate, self.errors)
        return encoded.splitlines(1)

    def write(self, data):

        """ Decode data with the frontend decoder and hand the result
            to the backend writer.
        """
        intermediate, _consumed = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    def writelines(self, list):

        """ Join the given strings, decode the result with the
            frontend decoder and hand it to the backend writer.
        """
        joined = ''.join(list)
        intermediate, _consumed = self.decode(joined, self.errors)
        return self.writer.write(intermediate)

    def reset(self):

        """ Reset the reader first, then the writer. """
        for codec in (self.reader, self.writer):
            codec.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate every attribute not found on the recoder to the
            underlying stream.
        """
        target = self.stream
        return getattr(target, name)
Guido van Rossum0612d842000-03-10 23:20:43 +0000458
459### Shortcuts
460
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is returned unwrapped and mode is used as
        given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        Raises LookupError for an unknown encoding; in that case the
        file is not opened at all.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)
    # Resolve the codec before touching the file system: an unknown
    # encoding must neither leak an open file object nor create or
    # truncate the file when a write mode was requested.
    sr, sw = lookup(encoding)[2:]
    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
Guido van Rossum0612d842000-03-10 23:20:43 +0000504
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that encoding translation happens transparently.

        Strings written to the wrapper are interpreted according to
        data_encoding and then written to the original file as
        strings using file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Strings read from the file are interpreted using
        file_encoding and handed back to the caller as strings using
        data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries two extra attributes,
        .data_encoding and .file_encoding, reflecting the parameters
        of the same name for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless encode/decode pair of the data encoding.
    frontend = lookup(data_encoding)
    encode, decode = frontend[:2]
    # Backend: stream reader/writer factories of the file encoding.
    backend = lookup(file_encoding)
    Reader, Writer = backend[2:]
    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
Guido van Rossum0612d842000-03-10 23:20:43 +0000541
Marc-André Lemburga866df82001-01-03 21:29:14 +0000542### Helpers for charmap-based codecs
543
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary in which every element of the
        rng sequence is both a key and its own value.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
556
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        Every (key, value) pair of decoding_map becomes a
        (value, key) pair of the returned dictionary.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # Ambiguous target: mark it as undefined. Once set to
            # None it stays None for any further duplicates.
            m[v] = None
    return m
Tim Peters3a2ab1a2001-05-29 06:06:54 +0000577
Guido van Rossum0612d842000-03-10 23:20:43 +0000578### Tests
Guido van Rossum1c89b0e2000-04-11 15:41:38 +0000579
if __name__ == '__main__':

    import sys

    # Data written to stdout is taken to be Latin-1 and is emitted
    # as UTF-8.
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data arriving on stdin is taken to be Latin-1 and is handed to
    # the program as UTF-8.
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')