Blame - Doc/lib/libcodecs.tex - platform/external/python/cpython3

blob: caaaaf49a182e184f025c9dd13db008e03c6c329 [file] [log] [blame]

Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	1	\section{\module{codecs} ---
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	2	Codec registry and base classes}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	3
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	4	\declaremodule{standard}{codecs}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	5	\modulesynopsis{Encode and decode data and streams.}
				6	\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
				7	\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
				8
				9
				10	\index{Unicode}
				11	\index{Codecs}
				12	\indexii{Codecs}{encode}
				13	\indexii{Codecs}{decode}
				14	\index{streams}
				15	\indexii{stackable}{streams}
				16
				17
				18	This module defines base classes for standard Python codecs (encoders
				19	and decoders) and provides access to the internal Python codec
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	20	registry which manages the codec and error handling lookup process.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	21
				22	It defines the following functions:
				23
				24	\begin{funcdesc}{register}{search_function}
				25	Register a codec search function. Search functions are expected to
				26	take one argument, the encoding name in all lower case letters, and
				27	return a tuple of functions \code{(\var{encoder}, \var{decoder}, \var{stream_reader},
				28	\var{stream_writer})} taking the following arguments:
				29
				30	\var{encoder} and \var{decoder}: These must be functions or methods
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	31	which have the same interface as the
				32	\method{encode()}/\method{decode()} methods of Codec instances (see
				33	Codec Interface). The functions/methods are expected to work in a
				34	stateless mode.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	35
				36	\var{stream_reader} and \var{stream_writer}: These have to be
				37	factory functions providing the following interface:
				38
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	39	\code{factory(\var{stream}, \var{errors}='strict')}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	40
				41	The factory functions must return objects providing the interfaces
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	42	defined by the base classes \class{StreamWriter} and
				43	\class{StreamReader}, respectively. Stream codecs can maintain
				44	state.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	45
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	46	Possible values for errors are \code{'strict'} (raise an exception
				47	in case of an encoding error), \code{'replace'} (replace malformed
Walter Dörwald	72f8616	2002-11-19 21:51:35 +0000	[diff] [blame]	48	data with a suitable replacement marker, such as \character{?}),
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	49	\code{'ignore'} (ignore malformed data and continue without further
Walter Dörwald	72f8616	2002-11-19 21:51:35 +0000	[diff] [blame]	50	notice), \code{'xmlcharrefreplace'} (replace with the appropriate XML
				51	character reference (for encoding only)) and \code{'backslashreplace'}
				52	(replace with backslashed escape sequences (for encoding only)) as
				53	well as any other error handling name defined via
				54	\function{register_error()}.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	55
				56	In case a search function cannot find a given encoding, it should
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	57	return \code{None}.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	58	\end{funcdesc}
				59
				60	\begin{funcdesc}{lookup}{encoding}
				61	Looks up a codec tuple in the Python codec registry and returns the
				62	function tuple as defined above.
				63
				64	Encodings are first looked up in the registry's cache. If not found,
				65	the list of registered search functions is scanned. If no codecs tuple
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	66	is found, a \exception{LookupError} is raised. Otherwise, the codecs
				67	tuple is stored in the cache and returned to the caller.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	68	\end{funcdesc}
				69
Skip Montanaro	b02ea65	2002-04-17 19:33:06 +0000	[diff] [blame]	70	To simplify access to the various codecs, the module provides these
Marc-André Lemburg	494f2ae	2001-09-19 11:33:31 +0000	[diff] [blame]	71	additional functions which use \function{lookup()} for the codec
				72	lookup:
				73
				74	\begin{funcdesc}{getencoder}{encoding}
				75	Look up the codec for the given encoding and return its encoder
				76	function.
				77
				78	Raises a \exception{LookupError} in case the encoding cannot be found.
				79	\end{funcdesc}
				80
				81	\begin{funcdesc}{getdecoder}{encoding}
				82	Look up the codec for the given encoding and return its decoder
				83	function.
				84
				85	Raises a \exception{LookupError} in case the encoding cannot be found.
				86	\end{funcdesc}
				87
				88	\begin{funcdesc}{getreader}{encoding}
				89	Look up the codec for the given encoding and return its StreamReader
				90	class or factory function.
				91
				92	Raises a \exception{LookupError} in case the encoding cannot be found.
				93	\end{funcdesc}
				94
				95	\begin{funcdesc}{getwriter}{encoding}
				96	Look up the codec for the given encoding and return its StreamWriter
				97	class or factory function.
				98
				99	Raises a \exception{LookupError} in case the encoding cannot be found.
				100	\end{funcdesc}
				101
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	102	\begin{funcdesc}{register_error}{name, error_handler}
				103	Register the error handling function \var{error_handler} under the
Raymond Hettinger	8a64d40	2002-09-08 22:26:13 +0000	[diff] [blame]	104	name \var{name}. \var{error_handler} will be called during encoding
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	105	and decoding in case of an error, when \var{name} is specified as the
Walter Dörwald	2e0b18a	2003-01-31 17:19:08 +0000	[diff] [blame^]	106	errors parameter.
				107
				108	For encoding \var{error_handler} will be called with a
				109	\exception{UnicodeEncodeError} instance, which contains information about
				110	the location of the error. The error handler must either raise this or
				111	a different exception or return a tuple with a replacement for the
				112	unencodable part of the input and a position where encoding should
				113	continue. The encoder will encode the replacement and continue encoding
				114	the original input at the specified position. Negative position values
				115	will be treated as being relative to the end of the input string. If the
				116	resulting position is out of bounds, an \exception{IndexError} will be raised.
				117
				118	Decoding and translating work similarly, except that \exception{UnicodeDecodeError}
				119	or \exception{UnicodeTranslateError} will be passed to the handler and
				120	the replacement from the error handler will be put into the output
				121	directly.
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	122	\end{funcdesc}
				123
				124	\begin{funcdesc}{lookup_error}{name}
				125	Return the error handler previously registered under the name \var{name}.
				126
				127	Raises a \exception{LookupError} in case the handler cannot be found.
				128	\end{funcdesc}
				129
				130	\begin{funcdesc}{strict_errors}{exception}
				131	Implements the \code{strict} error handling.
				132	\end{funcdesc}
				133
				134	\begin{funcdesc}{replace_errors}{exception}
				135	Implements the \code{replace} error handling.
				136	\end{funcdesc}
				137
				138	\begin{funcdesc}{ignore_errors}{exception}
				139	Implements the \code{ignore} error handling.
				140	\end{funcdesc}
				141
				142	\begin{funcdesc}{xmlcharrefreplace_errors}{exception}
				143	Implements the \code{xmlcharrefreplace} error handling.
				144	\end{funcdesc}
				145
				146	\begin{funcdesc}{backslashreplace_errors}{exception}
				147	Implements the \code{backslashreplace} error handling.
				148	\end{funcdesc}
				149
Walter Dörwald	1a7a894	2002-11-02 13:32:07 +0000	[diff] [blame]	150	To simplify working with encoded files or streams, the module
				151	also defines these utility functions:
				152
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	153	\begin{funcdesc}{open}{filename, mode\optional{, encoding\optional{,
				154	errors\optional{, buffering}}}}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	155	Open an encoded file using the given \var{mode} and return
				156	a wrapped version providing transparent encoding/decoding.
				157
Fred Drake	0aa811c	2001-10-20 04:24:09 +0000	[diff] [blame]	158	\note{The wrapped version will only accept the object format
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	159	defined by the codecs, i.e.\ Unicode objects for most built-in
				160	codecs. Output is also codec-dependent and will usually be Unicode as
Fred Drake	0aa811c	2001-10-20 04:24:09 +0000	[diff] [blame]	161	well.}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	162
				163	\var{encoding} specifies the encoding which is to be used for the
				164	file.
				165
				166	\var{errors} may be given to define the error handling. It defaults
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	167	to \code{'strict'} which causes a \exception{ValueError} to be raised
				168	in case an encoding error occurs.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	169
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	170	\var{buffering} has the same meaning as for the built-in
				171	\function{open()} function. It defaults to line buffered.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	172	\end{funcdesc}
				173
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	174	\begin{funcdesc}{EncodedFile}{file, input\optional{,
				175	output\optional{, errors}}}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	176	Return a wrapped version of file which provides transparent
				177	encoding translation.
				178
				179	Strings written to the wrapped file are interpreted according to the
				180	given \var{input} encoding and then written to the original file as
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	181	strings using the \var{output} encoding. The intermediate encoding will
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	182	usually be Unicode but depends on the specified codecs.
				183
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	184	If \var{output} is not given, it defaults to \var{input}.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	185
				186	\var{errors} may be given to define the error handling. It defaults to
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	187	\code{'strict'}, which causes \exception{ValueError} to be raised in case
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	188	an encoding error occurs.
				189	\end{funcdesc}
				190
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	191	The module also provides the following constants which are useful
				192	for reading and writing to platform dependent files:
				193
				194	\begin{datadesc}{BOM}
				195	\dataline{BOM_BE}
				196	\dataline{BOM_LE}
Walter Dörwald	474458d	2002-06-04 15:16:29 +0000	[diff] [blame]	197	\dataline{BOM_UTF8}
				198	\dataline{BOM_UTF16}
				199	\dataline{BOM_UTF16_BE}
				200	\dataline{BOM_UTF16_LE}
				201	\dataline{BOM_UTF32}
				202	\dataline{BOM_UTF32_BE}
				203	\dataline{BOM_UTF32_LE}
				204	These constants define various encodings of the Unicode byte order mark
				205	(BOM) used in UTF-16 and UTF-32 data streams to indicate the byte order
				206	used in the stream or file and in UTF-8 as a Unicode signature.
				207	\constant{BOM_UTF16} is either \constant{BOM_UTF16_BE} or
				208	\constant{BOM_UTF16_LE} depending on the platform's native byte order,
				209	\constant{BOM} is an alias for \constant{BOM_UTF16}, \constant{BOM_LE}
				210	for \constant{BOM_UTF16_LE} and \constant{BOM_BE} for \constant{BOM_UTF16_BE}.
				211	The others represent the BOM in UTF-8 and UTF-32 encodings.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	212	\end{datadesc}
				213
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	214
				215	\begin{seealso}
				216	\seeurl{http://sourceforge.net/projects/python-codecs/}{A
				217	SourceForge project working on additional support for Asian
				218	codecs for use with Python. They are in the early stages of
				219	development at the time of this writing --- look in their
				220	FTP area for downloadable files.}
				221	\end{seealso}
				222
				223
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	224	\subsection{Codec Base Classes}
				225
				226	The \module{codecs} module defines a set of base classes which define the
				227	interface and can also be used to easily write your own codecs for use
				228	in Python.
				229
				230	Each codec has to define four interfaces to make it usable as codec in
				231	Python: stateless encoder, stateless decoder, stream reader and stream
				232	writer. The stream reader and writers typically reuse the stateless
				233	encoder/decoder to implement the file protocols.
				234
				235	The \class{Codec} class defines the interface for stateless
				236	encoders/decoders.
				237
				238	To simplify and standardize error handling, the \method{encode()} and
				239	\method{decode()} methods may implement different error handling
				240	schemes by providing the \var{errors} string argument. The following
				241	string values are defined and implemented by all standard Python
				242	codecs:
				243
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	244	\begin{tableii}{l\|l}{code}{Value}{Meaning}
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	245	\lineii{'strict'}{Raise \exception{UnicodeError} (or a subclass);
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	246	this is the default.}
				247	\lineii{'ignore'}{Ignore the character and continue with the next.}
				248	\lineii{'replace'}{Replace with a suitable replacement character;
				249	Python will use the official U+FFFD REPLACEMENT
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	250	CHARACTER for the built-in Unicode codecs on
				251	decoding and '?' on encoding.}
				252	\lineii{'xmlcharrefreplace'}{Replace with the appropriate XML
				253	character reference (only for encoding).}
				254	\lineii{'backslashreplace'}{Replace with backslashed escape sequences
				255	(only for encoding).}
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	256	\end{tableii}
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	257
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	258	The set of allowed values can be extended via \function{register_error()}.
				259
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	260
				261	\subsubsection{Codec Objects \label{codec-objects}}
				262
				263	The \class{Codec} class defines these methods which also define the
				264	function interfaces of the stateless encoder and decoder:
				265
				266	\begin{methoddesc}{encode}{input\optional{, errors}}
				267	Encodes the object \var{input} and returns a tuple (output object,
Skip Montanaro	6c7bc31	2002-04-16 15:12:10 +0000	[diff] [blame]	268	length consumed). While codecs are not restricted to use with Unicode, in
				269	a Unicode context, encoding converts a Unicode object to a plain string
				270	using a particular character set encoding (e.g., \code{cp1252} or
				271	\code{iso-8859-1}).
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	272
				273	\var{errors} defines the error handling to apply. It defaults to
				274	\code{'strict'} handling.
				275
				276	The method may not store state in the \class{Codec} instance. Use
				277	\class{StreamCodec} for codecs which have to keep state in order to
				278	make encoding/decoding efficient.
				279
				280	The encoder must be able to handle zero length input and return an
				281	empty object of the output object type in this situation.
				282	\end{methoddesc}
				283
				284	\begin{methoddesc}{decode}{input\optional{, errors}}
				285	Decodes the object \var{input} and returns a tuple (output object,
Skip Montanaro	6c7bc31	2002-04-16 15:12:10 +0000	[diff] [blame]	286	length consumed). In a Unicode context, decoding converts a plain string
				287	encoded using a particular character set encoding to a Unicode object.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	288
				289	\var{input} must be an object which provides the \code{bf_getreadbuf}
				290	buffer slot. Python strings, buffer objects and memory mapped files
				291	are examples of objects providing this slot.
				292
				293	\var{errors} defines the error handling to apply. It defaults to
				294	\code{'strict'} handling.
				295
				296	The method may not store state in the \class{Codec} instance. Use
				297	\class{StreamCodec} for codecs which have to keep state in order to
				298	make encoding/decoding efficient.
				299
				300	The decoder must be able to handle zero length input and return an
				301	empty object of the output object type in this situation.
				302	\end{methoddesc}
				303
				304	The \class{StreamWriter} and \class{StreamReader} classes provide
				305	generic working interfaces which can be used to implement new
				306	encodings submodules very easily. See \module{encodings.utf_8} for an
				307	example on how this is done.
				308
				309
				310	\subsubsection{StreamWriter Objects \label{stream-writer-objects}}
				311
				312	The \class{StreamWriter} class is a subclass of \class{Codec} and
				313	defines the following methods which every stream writer must define in
				314	order to be compatible with the Python codec registry.
				315
				316	\begin{classdesc}{StreamWriter}{stream\optional{, errors}}
				317	Constructor for a \class{StreamWriter} instance.
				318
				319	All stream writers must provide this constructor interface. They are
				320	free to add additional keyword arguments, but only the ones defined
				321	here are used by the Python codec registry.
				322
				323	\var{stream} must be a file-like object open for writing (binary)
				324	data.
				325
				326	The \class{StreamWriter} may implement different error handling
				327	schemes by providing the \var{errors} keyword argument. These
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	328	parameters are predefined:
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	329
				330	\begin{itemize}
				331	\item \code{'strict'} Raise \exception{ValueError} (or a subclass);
				332	this is the default.
				333	\item \code{'ignore'} Ignore the character and continue with the next.
				334	\item \code{'replace'} Replace with a suitable replacement character.
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	335	\item \code{'xmlcharrefreplace'} Replace with the appropriate XML
				336	character reference.
				337	\item \code{'backslashreplace'} Replace with backslashed escape sequences.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	338	\end{itemize}
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	339
				340	The \var{errors} argument will be assigned to an attribute of the
				341	same name. Assigning to this attribute makes it possible to switch
				342	between different error handling strategies during the lifetime
				343	of the \class{StreamWriter} object.
				344
				345	The set of allowed values for the \var{errors} argument can
				346	be extended with \function{register_error()}.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	347	\end{classdesc}
				348
				349	\begin{methoddesc}{write}{object}
				350	Writes the object's contents encoded to the stream.
				351	\end{methoddesc}
				352
				353	\begin{methoddesc}{writelines}{list}
				354	Writes the concatenated list of strings to the stream (possibly by
				355	reusing the \method{write()} method).
				356	\end{methoddesc}
				357
				358	\begin{methoddesc}{reset}{}
				359	Flushes and resets the codec buffers used for keeping state.
				360
				361	Calling this method should ensure that the data on the output is put
				362	into a clean state, that allows appending of new fresh data without
				363	having to rescan the whole stream to recover state.
				364	\end{methoddesc}
				365
				366	In addition to the above methods, the \class{StreamWriter} must also
				367	inherit all other methods and attributes from the underlying stream.
				368
				369
				370	\subsubsection{StreamReader Objects \label{stream-reader-objects}}
				371
				372	The \class{StreamReader} class is a subclass of \class{Codec} and
				373	defines the following methods which every stream reader must define in
				374	order to be compatible with the Python codec registry.
				375
				376	\begin{classdesc}{StreamReader}{stream\optional{, errors}}
				377	Constructor for a \class{StreamReader} instance.
				378
				379	All stream readers must provide this constructor interface. They are
				380	free to add additional keyword arguments, but only the ones defined
				381	here are used by the Python codec registry.
				382
				383	\var{stream} must be a file-like object open for reading (binary)
				384	data.
				385
				386	The \class{StreamReader} may implement different error handling
				387	schemes by providing the \var{errors} keyword argument. These
				388	parameters are defined:
				389
				390	\begin{itemize}
				391	\item \code{'strict'} Raise \exception{ValueError} (or a subclass);
				392	this is the default.
				393	\item \code{'ignore'} Ignore the character and continue with the next.
				394	\item \code{'replace'} Replace with a suitable replacement character.
				395	\end{itemize}
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	396
				397	The \var{errors} argument will be assigned to an attribute of the
				398	same name. Assigning to this attribute makes it possible to switch
				399	between different error handling strategies during the lifetime
				400	of the \class{StreamReader} object.
				401
				402	The set of allowed values for the \var{errors} argument can
				403	be extended with \function{register_error()}.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	404	\end{classdesc}
				405
				406	\begin{methoddesc}{read}{\optional{size}}
				407	Decodes data from the stream and returns the resulting object.
				408
				409	\var{size} indicates the approximate maximum number of bytes to read
				410	from the stream for decoding purposes. The decoder can modify this
				411	setting as appropriate. The default value -1 indicates to read and
				412	decode as much as possible. \var{size} is intended to prevent having
				413	to decode huge files in one step.
				414
				415	The method should use a greedy read strategy meaning that it should
				416	read as much data as is allowed within the definition of the encoding
				417	and the given size, e.g.\ if optional encoding endings or state
				418	markers are available on the stream, these should be read too.
				419	\end{methoddesc}
				420
				421	\begin{methoddesc}{readline}{\optional{size}}
				422	Read one line from the input stream and return the
				423	decoded data.
				424
Fred Drake	0aa811c	2001-10-20 04:24:09 +0000	[diff] [blame]	425	Unlike the \method{readlines()} method, this method inherits
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	426	the line breaking knowledge from the underlying stream's
				427	\method{readline()} method -- there is currently no support for line
				428	breaking using the codec decoder due to lack of line buffering.
				429	Subclasses should however, if possible, try to implement this method
				430	using their own knowledge of line breaking.
				431
				432	\var{size}, if given, is passed as size argument to the stream's
				433	\method{readline()} method.
				434	\end{methoddesc}
				435
				436	\begin{methoddesc}{readlines}{\optional{sizehint}}
				437	Read all lines available on the input stream and return them as list
				438	of lines.
				439
				440	Line breaks are implemented using the codec's decoder method and are
				441	included in the list entries.
				442
				443	\var{sizehint}, if given, is passed as \var{size} argument to the
				444	stream's \method{read()} method.
				445	\end{methoddesc}
				446
				447	\begin{methoddesc}{reset}{}
				448	Resets the codec buffers used for keeping state.
				449
				450	Note that no stream repositioning should take place. This method is
				451	primarily intended to be able to recover from decoding errors.
				452	\end{methoddesc}
				453
				454	In addition to the above methods, the \class{StreamReader} must also
				455	inherit all other methods and attributes from the underlying stream.
				456
				457	The next two base classes are included for convenience. They are not
				458	needed by the codec registry, but may prove useful in practice.
				459
				460
				461	\subsubsection{StreamReaderWriter Objects \label{stream-reader-writer}}
				462
				463	The \class{StreamReaderWriter} allows wrapping streams which work in
				464	both read and write modes.
				465
				466	The design is such that one can use the factory functions returned by
				467	the \function{lookup()} function to construct the instance.
				468
				469	\begin{classdesc}{StreamReaderWriter}{stream, Reader, Writer, errors}
				470	Creates a \class{StreamReaderWriter} instance.
				471	\var{stream} must be a file-like object.
				472	\var{Reader} and \var{Writer} must be factory functions or classes
				473	providing the \class{StreamReader} and \class{StreamWriter} interface
				474	resp.
				475	Error handling is done in the same way as defined for the
				476	stream readers and writers.
				477	\end{classdesc}
				478
				479	\class{StreamReaderWriter} instances define the combined interfaces of
				480	\class{StreamReader} and \class{StreamWriter} classes. They inherit
				481	all other methods and attributes from the underlying stream.
				482
				483
				484	\subsubsection{StreamRecoder Objects \label{stream-recoder-objects}}
				485
				486	The \class{StreamRecoder} provides a frontend-backend view of
				487	encoding data which is sometimes useful when dealing with different
				488	encoding environments.
				489
				490	The design is such that one can use the factory functions returned by
				491	the \function{lookup()} function to construct the instance.
				492
				493	\begin{classdesc}{StreamRecoder}{stream, encode, decode,
				494	Reader, Writer, errors}
				495	Creates a \class{StreamRecoder} instance which implements a two-way
				496	conversion: \var{encode} and \var{decode} work on the frontend (the
				497	input to \method{read()} and output of \method{write()}) while
				498	\var{Reader} and \var{Writer} work on the backend (reading and
				499	writing to the stream).
				500
				501	You can use these objects to do transparent direct recodings from
				502	e.g.\ Latin-1 to UTF-8 and back.
				503
				504	\var{stream} must be a file-like object.
				505
				506	\var{encode}, \var{decode} must adhere to the \class{Codec}
				507	interface, \var{Reader}, \var{Writer} must be factory functions or
				508	classes providing objects of the \class{StreamReader} and
				509	\class{StreamWriter} interface respectively.
				510
				511	\var{encode} and \var{decode} are needed for the frontend
				512	translation, \var{Reader} and \var{Writer} for the backend
				513	translation. The intermediate format used is determined by the two
				514	sets of codecs, e.g.\ the Unicode codecs will use Unicode as
				515	intermediate encoding.
				516
				517	Error handling is done in the same way as defined for the
				518	stream readers and writers.
				519	\end{classdesc}
				520
				521	\class{StreamRecoder} instances define the combined interfaces of
				522	\class{StreamReader} and \class{StreamWriter} classes. They inherit
				523	all other methods and attributes from the underlying stream.
				524
Martin v. Löwis	5c37a77	2002-12-31 12:39:07 +0000	[diff] [blame]	525	\subsection{Standard Encodings}
				526
				527	Python comes with a number of codecs builtin, either implemented as C
				528	functions, or with dictionaries as mapping tables. The following table
				529	lists the codecs by name, together with a few common aliases, and the
				530	languages for which the encoding is likely used. Neither the list of
				531	aliases nor the list of languages is meant to be exhaustive. Notice
				532	that spelling alternatives that only differ in case or use a hyphen
				533	instead of an underscore are also valid aliases.
				534
				535	Many of the character sets support the same languages. They vary in
				536	individual characters (e.g.\ whether the EURO SIGN is supported or
				537	not), and in the assignment of characters to code positions. For the
				538	European languages in particular, the following variants typically
				539	exist:
				540
				541	\begin{itemize}
				542	\item an ISO 8859 codeset
				543	\item a Microsoft Windows code page, which is typically derived from
				544	an 8859 codeset, but replaces control characters with additional
				545	graphic characters
				546	\item an IBM EBCDIC code page
				547	\item an IBM PC code page, which is ASCII compatible
				548	\end{itemize}
				549
				550	\begin{longtableiii}{l\|l\|l}{textrm}{Codec}{Aliases}{Languages}
				551
				552	\lineiii{ascii}
				553	{646, us-ascii}
				554	{English}
				555
				556	\lineiii{cp037}
				557	{IBM037, IBM039}
				558	{English}
				559
				560	\lineiii{cp424}
				561	{EBCDIC-CP-HE, IBM424}
				562	{Hebrew}
				563
				564	\lineiii{cp437}
				565	{437, IBM437}
				566	{English}
				567
				568	\lineiii{cp500}
				569	{EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500}
				570	{Western Europe}
				571
				572	\lineiii{cp737}
				573	{}
				574	{Greek}
				575
				576	\lineiii{cp775}
				577	{IBM775}
				578	{Baltic languages}
				579
				580	\lineiii{cp850}
				581	{850, IBM850}
				582	{Western Europe}
				583
				584	\lineiii{cp852}
				585	{852, IBM852}
				586	{Central and Eastern Europe}
				587
				588	\lineiii{cp855}
				589	{855, IBM855}
				590	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				591
				592	\lineiii{cp856}
				593	{}
				594	{Hebrew}
				595
				596	\lineiii{cp857}
				597	{857, IBM857}
				598	{Turkish}
				599
				600	\lineiii{cp860}
				601	{860, IBM860}
				602	{Portuguese}
				603
				604	\lineiii{cp861}
				605	{861, CP-IS, IBM861}
				606	{Icelandic}
				607
				608	\lineiii{cp862}
				609	{862, IBM862}
				610	{Hebrew}
				611
				612	\lineiii{cp863}
				613	{863, IBM863}
				614	{Canadian}
				615
				616	\lineiii{cp864}
				617	{IBM864}
				618	{Arabic}
				619
				620	\lineiii{cp865}
				621	{865, IBM865}
				622	{Danish, Norwegian}
				623
				624	\lineiii{cp869}
				625	{869, CP-GR, IBM869}
				626	{Greek}
				627
				628	\lineiii{cp874}
				629	{}
				630	{Thai}
				631
				632	\lineiii{cp875}
				633	{}
				634	{Greek}
				635
				636	\lineiii{cp1006}
				637	{}
				638	{Urdu}
				639
				640	\lineiii{cp1026}
				641	{ibm1026}
				642	{Turkish}
				643
				644	\lineiii{cp1140}
				645	{ibm1140}
				646	{Western Europe}
				647
				648	\lineiii{cp1250}
				649	{windows-1250}
				650	{Central and Eastern Europe}
				651
				652	\lineiii{cp1251}
				653	{windows-1251}
				654	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				655
				656	\lineiii{cp1252}
				657	{windows-1252}
				658	{Western Europe}
				659
				660	\lineiii{cp1253}
				661	{windows-1253}
				662	{Greek}
				663
				664	\lineiii{cp1254}
				665	{windows-1254}
				666	{Turkish}
				667
				668	\lineiii{cp1255}
				669	{windows-1255}
				670	{Hebrew}
				671
				672	\lineiii{cp1256}
				673	{windows-1256}
				674	{Arabic}
				675
				676	\lineiii{cp1257}
				677	{windows-1257}
				678	{Baltic languages}
				679
				680	\lineiii{cp1258}
				681	{windows-1258}
				682	{Vietnamese}
				683
				684	\lineiii{latin_1}
				685	{iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1}
				686	{Western Europe}
				687
				688	\lineiii{iso8859_2}
				689	{iso-8859-2, latin2, L2}
				690	{Central and Eastern Europe}
				691
				692	\lineiii{iso8859_3}
				693	{iso-8859-3, latin3, L3}
				694	{Esperanto, Maltese}
				695
				696	\lineiii{iso8859_4}
				697	{iso-8859-4, latin4, L4}
				698	{Baltic languages}
				699
				700	\lineiii{iso8859_5}
				701	{iso-8859-5, cyrillic}
				702	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				703
				704	\lineiii{iso8859_6}
				705	{iso-8859-6, arabic}
				706	{Arabic}
				707
				708	\lineiii{iso8859_7}
				709	{iso-8859-7, greek, greek8}
				710	{Greek}
				711
				712	\lineiii{iso8859_8}
				713	{iso-8859-8, hebrew}
				714	{Hebrew}
				715
				716	\lineiii{iso8859_9}
				717	{iso-8859-9, latin5, L5}
				718	{Turkish}
				719
				720	\lineiii{iso8859_10}
				721	{iso-8859-10, latin6, L6}
				722	{Nordic languages}
				723
				724	\lineiii{iso8859_13}
				725	{iso-8859-13}
				726	{Baltic languages}
				727
				728	\lineiii{iso8859_14}
				729	{iso-8859-14, latin8, L8}
				730	{Celtic languages}
				731
				732	\lineiii{iso8859_15}
				733	{iso-8859-15}
				734	{Western Europe}
				735
				736	\lineiii{koi8_r}
				737	{}
				738	{Russian}
				739
				740	\lineiii{koi8_u}
				741	{}
				742	{Ukrainian}
				743
				744	\lineiii{mac_cyrillic}
				745	{maccyrillic}
				746	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				747
				748	\lineiii{mac_greek}
				749	{macgreek}
				750	{Greek}
				751
				752	\lineiii{mac_iceland}
				753	{maciceland}
				754	{Icelandic}
				755
				756	\lineiii{mac_latin2}
				757	{maclatin2, maccentraleurope}
				758	{Central and Eastern Europe}
				759
				760	\lineiii{mac_roman}
				761	{macroman}
				762	{Western Europe}
				763
				764	\lineiii{mac_turkish}
				765	{macturkish}
				766	{Turkish}
				767
				768	\lineiii{utf_16}
				769	{U16, utf16}
				770	{all languages}
				771
				772	\lineiii{utf_16_be}
				773	{UTF-16BE}
				774	{all languages (BMP only)}
				775
				776	\lineiii{utf_16_le}
				777	{UTF-16LE}
				778	{all languages (BMP only)}
				779
				780	\lineiii{utf_7}
				781	{U7}
				782	{all languages}
				783
				784	\lineiii{utf_8}
				785	{U8, UTF, utf8}
				786	{all languages}
				787
				788	\end{longtableiii}
				789
				790	A number of codecs are specific to Python, so their codec names have
				791	no meaning outside Python. Some of them don't convert from Unicode
				792	strings to byte strings, but instead use the property of the Python
				793	codecs machinery that any bijective function with one argument can be
				794	considered as an encoding.
				795
				796	For the codecs listed below, the result in the ``encoding'' direction
				797	is always a byte string. The result of the ``decoding'' direction is
				798	listed as operand type in the table.
				799
				800	\begin{tableiv}{l|l|l|l}{textrm}{Codec}{Aliases}{Operand type}{Purpose}
				801
				802	\lineiv{base64_codec}
				803	{base64, base-64}
				804	{byte string}
				805	{Convert operand to MIME base64}
				806
				807	\lineiv{hex_codec}
				808	{hex}
				809	{byte string}
				810	{Convert operand to hexadecimal representation, with two digits per byte}
				811
				812	\lineiv{mbcs}
				813	{dbcs}
				814	{Unicode string}
				815	{Windows only: Encode operand according to the ANSI codepage (CP_ACP)}
				816
				817	\lineiv{palmos}
				818	{}
				819	{Unicode string}
				820	{Encoding of PalmOS 3.5}
				821
				822	\lineiv{quopri_codec}
				823	{quopri, quoted-printable, quotedprintable}
				824	{byte string}
				825	{Convert operand to MIME quoted printable}
				826
				827	\lineiv{raw_unicode_escape}
				828	{}
				829	{Unicode string}
				830	{Produce a string that is suitable as raw Unicode literal in Python source code}
				831
				832	\lineiv{rot_13}
				833	{rot13}
				834	{byte string}
				835	{Return the Caesar-cypher encryption of the operand}
				836
				837	\lineiv{string_escape}
				838	{}
				839	{byte string}
				840	{Produce a string that is suitable as string literal in Python source code}
				841
				842	\lineiv{undefined}
				843	{}
				844	{any}
				845	{Raise an exception for all conversions. Can be used as the system encoding if no automatic coercion between byte and Unicode strings is desired.}
				846
				847	\lineiv{unicode_escape}
				848	{}
				849	{Unicode string}
				850	{Produce a string that is suitable as Unicode literal in Python source code}
				851
				852	\lineiv{unicode_internal}
				853	{}
				854	{Unicode string}
				855	{Return the internal representation of the operand}
				856
				857	\lineiv{uu_codec}
				858	{uu}
				859	{byte string}
				860	{Convert the operand using uuencode}
				861
				862	\lineiv{zlib_codec}
				863	{zip, zlib}
				864	{byte string}
				865	{Compress the operand using zlib}
				866
				867	\end{tableiv}