""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import __builtin__, sys

### Registry and builtin stateless codec functions

try:
    from _codecs import *
except ImportError, why:
    raise SystemError('Failed to load the builtin codecs: %s' % why)

__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]

### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE

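# Illustrative sketch (editor's example, not part of the original module):
# how the BOM constants above might be used to guess a codec name from the
# first bytes of raw data. UTF-32 is checked first because its little-endian
# BOM starts with the UTF-16 one. The helper name is a hypothetical placeholder.
def _sniff_bom_encoding(data):
    if data.startswith(BOM_UTF32_LE) or data.startswith(BOM_UTF32_BE):
        return 'utf-32'
    if data.startswith(BOM_UTF16_LE) or data.startswith(BOM_UTF16_BE):
        return 'utf-16'
    if data.startswith(BOM_UTF8):
        return 'utf-8'
    return None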

### Codec base classes (defining the API)

class CodecInfo(tuple):

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
            Python will use the official U+FFFD REPLACEMENT
            CHARACTER for the builtin Unicode codecs on
            decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder
    remembers the state of the encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the base class for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = "" # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Override this method in subclasses: it must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the base class for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = "" # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Override this method in subclasses: it must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

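# Illustrative sketch (editor's example, not part of the original module):
# decoding a byte stream piece by piece with an incremental decoder, so that
# multi-byte sequences split across chunks are handled by the internal buffer.
# The helper name and 'chunks' argument are hypothetical; 'utf-8' is only an
# example codec.
def _example_incremental_decode(chunks, encoding='utf-8', errors='strict'):
    decoder = getincrementaldecoder(encoding)(errors)
    pieces = []
    for chunk in chunks:
        pieces.append(decoder.decode(chunk))
    # final=True flushes any bytes still buffered by the decoder
    pieces.append(decoder.decode("", True))
    return u"".join(pieces)
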
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state that allows appending
            of new, fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return fewer, if there are not
            enough characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input, only the first
            line will be returned; the rest of the input will be kept
            until the next call to read().

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars < 0:
                if size < 0:
                    if self.charbuffer:
                        break
                elif len(self.charbuffer) >= size:
                    break
            else:
                if len(self.charbuffer) >= chars:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize < 8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way of finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.reset()
        self.stream.seek(offset, whence)

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

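# Illustrative sketch (editor's example, not part of the original module):
# wrapping binary file objects with a codec's StreamWriter and StreamReader
# obtained via the getwriter()/getreader() helpers defined further below.
# The helper name, file name and codec name are hypothetical placeholders.
def _example_stream_roundtrip(path='example.txt', encoding='utf-8'):
    writer = getwriter(encoding)(__builtin__.open(path, 'wb'))
    writer.write(u'spam\n')          # unicode in, encoded bytes out
    writer.close()
    reader = getreader(encoding)(__builtin__.open(path, 'rb'))
    try:
        return reader.read()         # encoded bytes in, unicode out
    finally:
        reader.close()
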
###

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interfaces,
            respectively.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

###

class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then encoded before being returned to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface; Reader,
            Writer must be factory functions or classes providing the
            StreamReader and StreamWriter interfaces, respectively.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()

### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw

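# Illustrative sketch (editor's example, not part of the original module):
# round-tripping unicode text through the open() shortcut above. The helper
# name and file name are hypothetical placeholders.
def _example_codecs_open(path='example.txt'):
    f = open(path, 'w', encoding='utf-8')
    try:
        f.write(u'caf\xe9\n')       # encoded to UTF-8 on the way out
    finally:
        f.close()
    f = open(path, 'r', encoding='utf-8')
    try:
        return f.read()              # decoded back to a unicode object
    finally:
        f.close()
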
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as a string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as a string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

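# Illustrative sketch (editor's example, not part of the original module):
# recoding data between two byte encodings with EncodedFile(). A StringIO
# object stands in for any file-like object; the helper name is hypothetical.
def _example_encoded_file():
    import StringIO
    backing = StringIO.StringIO()
    # Bytes written as Latin-1 are stored in the underlying file as UTF-8.
    wrapped = EncodedFile(backing, data_encoding='latin-1', file_encoding='utf-8')
    wrapped.write('caf\xe9')
    return backing.getvalue()        # 'caf\xc3\xa9'
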
### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode

def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        raise LookupError(encoding)
    return encoder

def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        raise LookupError(encoding)
    return decoder

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter

def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using an IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    output = encoder.encode("", True)
    if output:
        yield output

def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using an IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    output = decoder.decode("", True)
    if output:
        yield output

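# Illustrative sketch (editor's example, not part of the original module):
# streaming decode of byte chunks with iterdecode(); iterencode() works the
# same way in the other direction. The helper name and chunk values are
# hypothetical.
def _example_iterdecode():
    chunks = ['caf', '\xc3', '\xa9']                # UTF-8 bytes split mid-sequence
    return u''.join(iterdecode(chunks, 'utf-8'))    # u'caf\xe9'
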
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k,v in decoding_map.items():
        if not v in m:
            m[v] = k
        else:
            m[v] = None
    return m

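# Illustrative sketch (editor's example, not part of the original module):
# building the tables for a tiny hypothetical charmap codec. ASCII bytes map
# to themselves; byte 0x80 decodes to U+20AC (EURO SIGN).
def _example_charmap_tables():
    decoding_map = make_identity_dict(range(128))
    decoding_map[0x80] = 0x20ac
    encoding_map = make_encoding_map(decoding_map)
    return decoding_map, encoding_map
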
### error handlers

try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')