Blame - Doc/lib/libcodecs.tex - platform/external/python/cpython3

blob: 355ac5d112bb1deae9a30f49bd7b43d304a85c13 [file] [log] [blame]

Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	1	\section{\module{codecs} ---
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	2	Codec registry and base classes}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	3
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	4	\declaremodule{standard}{codecs}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	5	\modulesynopsis{Encode and decode data and streams.}
				6	\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
				7	\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
				8
				9
				10	\index{Unicode}
				11	\index{Codecs}
				12	\indexii{Codecs}{encode}
				13	\indexii{Codecs}{decode}
				14	\index{streams}
				15	\indexii{stackable}{streams}
				16
				17
				18	This module defines base classes for standard Python codecs (encoders
				19	and decoders) and provides access to the internal Python codec
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	20	registry which manages the codec and error handling lookup process.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	21
				22	It defines the following functions:
				23
				24	\begin{funcdesc}{register}{search_function}
				25	Register a codec search function. Search functions are expected to
				26	take one argument, the encoding name in all lower case letters, and
				27	return a tuple of functions \code{(\var{encoder}, \var{decoder}, \var{stream_reader},
				28	\var{stream_writer})} taking the following arguments:
				29
				30	\var{encoder} and \var{decoder}: These must be functions or methods
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	31	which have the same interface as the
				32	\method{encode()}/\method{decode()} methods of Codec instances (see
				33	Codec Interface). The functions/methods are expected to work in a
				34	stateless mode.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	35
				36	\var{stream_reader} and \var{stream_writer}: These have to be
				37	factory functions providing the following interface:
				38
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	39	\code{factory(\var{stream}, \var{errors}='strict')}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	40
				41	The factory functions must return objects providing the interfaces
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	42	defined by the base classes \class{StreamWriter} and
				43	\class{StreamReader}, respectively. Stream codecs can maintain
				44	state.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	45
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	46	Possible values for errors are \code{'strict'} (raise an exception
				47	in case of an encoding error), \code{'replace'} (replace malformed
Walter Dörwald	72f8616	2002-11-19 21:51:35 +0000	[diff] [blame]	48	data with a suitable replacement marker, such as \character{?}),
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	49	\code{'ignore'} (ignore malformed data and continue without further
Walter Dörwald	72f8616	2002-11-19 21:51:35 +0000	[diff] [blame]	50	notice), \code{'xmlcharrefreplace'} (replace with the appropriate XML
				51	character reference (for encoding only)) and \code{'backslashreplace'}
				52	(replace with backslashed escape sequences (for encoding only)) as
				53	well as any other error handling name defined via
				54	\function{register_error()}.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	55
				56	In case a search function cannot find a given encoding, it should
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	57	return \code{None}.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	58	\end{funcdesc}
				59
				60	\begin{funcdesc}{lookup}{encoding}
				61	Looks up a codec tuple in the Python codec registry and returns the
				62	function tuple as defined above.
				63
				64	Encodings are first looked up in the registry's cache. If not found,
				65	the list of registered search functions is scanned. If no codecs tuple
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	66	is found, a \exception{LookupError} is raised. Otherwise, the codecs
				67	tuple is stored in the cache and returned to the caller.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	68	\end{funcdesc}
				69
Skip Montanaro	b02ea65	2002-04-17 19:33:06 +0000	[diff] [blame]	70	To simplify access to the various codecs, the module provides these
Marc-André Lemburg	494f2ae	2001-09-19 11:33:31 +0000	[diff] [blame]	71	additional functions which use \function{lookup()} for the codec
				72	lookup:
				73
				74	\begin{funcdesc}{getencoder}{encoding}
				75	Look up the codec for the given encoding and return its encoder
				76	function.
				77
				78	Raises a \exception{LookupError} in case the encoding cannot be found.
				79	\end{funcdesc}
				80
				81	\begin{funcdesc}{getdecoder}{encoding}
				82	Look up the codec for the given encoding and return its decoder
				83	function.
				84
				85	Raises a \exception{LookupError} in case the encoding cannot be found.
				86	\end{funcdesc}
				87
				88	\begin{funcdesc}{getreader}{encoding}
				89	Look up the codec for the given encoding and return its StreamReader
				90	class or factory function.
				91
				92	Raises a \exception{LookupError} in case the encoding cannot be found.
				93	\end{funcdesc}
				94
				95	\begin{funcdesc}{getwriter}{encoding}
				96	Look up the codec for the given encoding and return its StreamWriter
				97	class or factory function.
				98
				99	Raises a \exception{LookupError} in case the encoding cannot be found.
				100	\end{funcdesc}
				101
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	102	\begin{funcdesc}{register_error}{name, error_handler}
				103	Register the error handling function \var{error_handler} under the
Raymond Hettinger	8a64d40	2002-09-08 22:26:13 +0000	[diff] [blame]	104	name \var{name}. \var{error_handler} will be called during encoding
Walter Dörwald	3aeb632	2002-09-02 13:14:32 +0000	[diff] [blame]	105	and decoding in case of an error, when \var{name} is specified as the
				106	errors parameter. \var{error_handler} will be called with an
				107	\exception{UnicodeEncodeError}, \exception{UnicodeDecodeError} or
				108	\exception{UnicodeTranslateError} instance and must return a tuple
				109	with a replacement for the unencodable/undecodable part of the input
				110	and a position where encoding/decoding should continue.
				111	\end{funcdesc}
				112
				113	\begin{funcdesc}{lookup_error}{name}
				114	Return the error handler previously registered under the name \var{name}.
				115
				116	Raises a \exception{LookupError} in case the handler cannot be found.
				117	\end{funcdesc}
				118
				119	\begin{funcdesc}{strict_errors}{exception}
				120	Implements the \code{strict} error handling.
				121	\end{funcdesc}
				122
				123	\begin{funcdesc}{replace_errors}{exception}
				124	Implements the \code{replace} error handling.
				125	\end{funcdesc}
				126
				127	\begin{funcdesc}{ignore_errors}{exception}
				128	Implements the \code{ignore} error handling.
				129	\end{funcdesc}
				130
				131	\begin{funcdesc}{xmlcharrefreplace_errors}{exception}
				132	Implements the \code{xmlcharrefreplace} error handling.
				133	\end{funcdesc}
				134
				135	\begin{funcdesc}{backslashreplace_errors}{exception}
				136	Implements the \code{backslashreplace} error handling.
				137	\end{funcdesc}
				138
Walter Dörwald	1a7a894	2002-11-02 13:32:07 +0000	[diff] [blame]	139	To simplify working with encoded files or streams, the module
				140	also defines these utility functions:
				141
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	142	\begin{funcdesc}{open}{filename, mode\optional{, encoding\optional{,
				143	errors\optional{, buffering}}}}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	144	Open an encoded file using the given \var{mode} and return
				145	a wrapped version providing transparent encoding/decoding.
				146
Fred Drake	0aa811c	2001-10-20 04:24:09 +0000	[diff] [blame]	147	\note{The wrapped version will only accept the object format
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	148	defined by the codecs, i.e.\ Unicode objects for most built-in
				149	codecs. Output is also codec-dependent and will usually be Unicode as
Fred Drake	0aa811c	2001-10-20 04:24:09 +0000	[diff] [blame]	150	well.}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	151
				152	\var{encoding} specifies the encoding which is to be used for the
				153	file.
				154
				155	\var{errors} may be given to define the error handling. It defaults
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	156	to \code{'strict'} which causes a \exception{ValueError} to be raised
				157	in case an encoding error occurs.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	158
Fred Drake	69ca950	2000-04-06 16:09:59 +0000	[diff] [blame]	159	\var{buffering} has the same meaning as for the built-in
				160	\function{open()} function. It defaults to line buffered.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	161	\end{funcdesc}
				162
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	163	\begin{funcdesc}{EncodedFile}{file, input\optional{,
				164	output\optional{, errors}}}
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	165	Return a wrapped version of file which provides transparent
				166	encoding translation.
				167
				168	Strings written to the wrapped file are interpreted according to the
				169	given \var{input} encoding and then written to the original file as
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	170	strings using the \var{output} encoding. The intermediate encoding will
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	171	usually be Unicode but depends on the specified codecs.
				172
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	173	If \var{output} is not given, it defaults to \var{input}.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	174
				175	\var{errors} may be given to define the error handling. It defaults to
Fred Drake	e1b304d	2000-07-24 19:35:52 +0000	[diff] [blame]	176	\code{'strict'}, which causes \exception{ValueError} to be raised in case
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	177	an encoding error occurs.
				178	\end{funcdesc}
				179
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	180	The module also provides the following constants which are useful
				181	for reading and writing to platform dependent files:
				182
				183	\begin{datadesc}{BOM}
				184	\dataline{BOM_BE}
				185	\dataline{BOM_LE}
Walter Dörwald	474458d	2002-06-04 15:16:29 +0000	[diff] [blame]	186	\dataline{BOM_UTF8}
				187	\dataline{BOM_UTF16}
				188	\dataline{BOM_UTF16_BE}
				189	\dataline{BOM_UTF16_LE}
				190	\dataline{BOM_UTF32}
				191	\dataline{BOM_UTF32_BE}
				192	\dataline{BOM_UTF32_LE}
				193	These constants define various encodings of the Unicode byte order mark
				194	(BOM) used in UTF-16 and UTF-32 data streams to indicate the byte order
				195	used in the stream or file and in UTF-8 as a Unicode signature.
				196	\constant{BOM_UTF16} is either \constant{BOM_UTF16_BE} or
				197	\constant{BOM_UTF16_LE} depending on the platform's native byte order,
				198	\constant{BOM} is an alias for \constant{BOM_UTF16}, \constant{BOM_LE}
				199	for \constant{BOM_UTF16_LE} and \constant{BOM_BE} for \constant{BOM_UTF16_BE}.
				200	The others represent the BOM in UTF-8 and UTF-32 encodings.
Fred Drake	b7979c7	2000-04-06 14:21:58 +0000	[diff] [blame]	201	\end{datadesc}
				202
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	203
				204	\begin{seealso}
				205	\seeurl{http://sourceforge.net/projects/python-codecs/}{A
				206	SourceForge project working on additional support for Asian
				207	codecs for use with Python. They are in the early stages of
				208	development at the time of this writing --- look in their
				209	FTP area for downloadable files.}
				210	\end{seealso}
				211
				212
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	213	\subsection{Codec Base Classes}
				214
				215	The \module{codecs} module defines a set of base classes which define the
				216	interface and can also be used to easily write your own codecs for use
				217	in Python.
				218
				219	Each codec has to define four interfaces to make it usable as a codec in
				220	Python: stateless encoder, stateless decoder, stream reader and stream
				221	writer. The stream reader and writer typically reuse the stateless
				222	encoder/decoder to implement the file protocols.
				223
				224	The \class{Codec} class defines the interface for stateless
				225	encoders/decoders.
				226
				227	To simplify and standardize error handling, the \method{encode()} and
				228	\method{decode()} methods may implement different error handling
				229	schemes by providing the \var{errors} string argument. The following
				230	string values are defined and implemented by all standard Python
				231	codecs:
				232
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	233	\begin{tableii}{l\|l}{code}{Value}{Meaning}
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	234	\lineii{'strict'}{Raise \exception{UnicodeError} (or a subclass);
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	235	this is the default.}
				236	\lineii{'ignore'}{Ignore the character and continue with the next.}
				237	\lineii{'replace'}{Replace with a suitable replacement character;
				238	Python will use the official U+FFFD REPLACEMENT
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	239	CHARACTER for the built-in Unicode codecs on
				240	decoding and '?' on encoding.}
				241	\lineii{'xmlcharrefreplace'}{Replace with the appropriate XML
				242	character reference (only for encoding).}
				243	\lineii{'backslashreplace'}{Replace with backslashed escape sequences
				244	(only for encoding).}
Fred Drake	dc40ac0	2001-01-22 20:17:54 +0000	[diff] [blame]	245	\end{tableii}
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	246
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	247	The set of allowed values can be extended via \function{register_error()}.
				248
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	249
				250	\subsubsection{Codec Objects \label{codec-objects}}
				251
				252	The \class{Codec} class defines these methods which also define the
				253	function interfaces of the stateless encoder and decoder:
				254
				255	\begin{methoddesc}{encode}{input\optional{, errors}}
				256	Encodes the object \var{input} and returns a tuple (output object,
Skip Montanaro	6c7bc31	2002-04-16 15:12:10 +0000	[diff] [blame]	257	length consumed). While codecs are not restricted to use with Unicode, in
				258	a Unicode context, encoding converts a Unicode object to a plain string
				259	using a particular character set encoding (e.g., \code{cp1252} or
				260	\code{iso-8859-1}).
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	261
				262	\var{errors} defines the error handling to apply. It defaults to
				263	\code{'strict'} handling.
				264
				265	The method may not store state in the \class{Codec} instance. Use
				266	\class{StreamCodec} for codecs which have to keep state in order to
				267	make encoding/decoding efficient.
				268
				269	The encoder must be able to handle zero length input and return an
				270	empty object of the output object type in this situation.
				271	\end{methoddesc}
				272
				273	\begin{methoddesc}{decode}{input\optional{, errors}}
				274	Decodes the object \var{input} and returns a tuple (output object,
Skip Montanaro	6c7bc31	2002-04-16 15:12:10 +0000	[diff] [blame]	275	length consumed). In a Unicode context, decoding converts a plain string
				276	encoded using a particular character set encoding to a Unicode object.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	277
				278	\var{input} must be an object which provides the \code{bf_getreadbuf}
				279	buffer slot. Python strings, buffer objects and memory mapped files
				280	are examples of objects providing this slot.
				281
				282	\var{errors} defines the error handling to apply. It defaults to
				283	\code{'strict'} handling.
				284
				285	The method may not store state in the \class{Codec} instance. Use
				286	\class{StreamCodec} for codecs which have to keep state in order to
				287	make encoding/decoding efficient.
				288
				289	The decoder must be able to handle zero length input and return an
				290	empty object of the output object type in this situation.
				291	\end{methoddesc}
				292
				293	The \class{StreamWriter} and \class{StreamReader} classes provide
				294	generic working interfaces which can be used to implement new
				295	encodings submodules very easily. See \module{encodings.utf_8} for an
				296	example on how this is done.
				297
				298
				299	\subsubsection{StreamWriter Objects \label{stream-writer-objects}}
				300
				301	The \class{StreamWriter} class is a subclass of \class{Codec} and
				302	defines the following methods which every stream writer must define in
				303	order to be compatible with the Python codec registry.
				304
				305	\begin{classdesc}{StreamWriter}{stream\optional{, errors}}
				306	Constructor for a \class{StreamWriter} instance.
				307
				308	All stream writers must provide this constructor interface. They are
				309	free to add additional keyword arguments, but only the ones defined
				310	here are used by the Python codec registry.
				311
				312	\var{stream} must be a file-like object open for writing (binary)
				313	data.
				314
				315	The \class{StreamWriter} may implement different error handling
				316	schemes by providing the \var{errors} keyword argument. These
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	317	parameters are predefined:
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	318
				319	\begin{itemize}
				320	\item \code{'strict'} Raise \exception{ValueError} (or a subclass);
				321	this is the default.
				322	\item \code{'ignore'} Ignore the character and continue with the next.
				323	\item \code{'replace'} Replace with a suitable replacement character.
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	324	\item \code{'xmlcharrefreplace'} Replace with the appropriate XML
				325	character reference.
				326	\item \code{'backslashreplace'} Replace with backslashed escape sequences.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	327	\end{itemize}
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	328
				329	The \var{errors} argument will be assigned to an attribute of the
				330	same name. Assigning to this attribute makes it possible to switch
				331	between different error handling strategies during the lifetime
				332	of the \class{StreamWriter} object.
				333
				334	The set of allowed values for the \var{errors} argument can
				335	be extended with \function{register_error()}.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	336	\end{classdesc}
				337
				338	\begin{methoddesc}{write}{object}
				339	Writes the object's contents encoded to the stream.
				340	\end{methoddesc}
				341
				342	\begin{methoddesc}{writelines}{list}
				343	Writes the concatenated list of strings to the stream (possibly by
				344	reusing the \method{write()} method).
				345	\end{methoddesc}
				346
				347	\begin{methoddesc}{reset}{}
				348	Flushes and resets the codec buffers used for keeping state.
				349
				350	Calling this method should ensure that the data on the output is put
				351	into a clean state, that allows appending of new fresh data without
				352	having to rescan the whole stream to recover state.
				353	\end{methoddesc}
				354
				355	In addition to the above methods, the \class{StreamWriter} must also
				356	inherit all other methods and attributes from the underlying stream.
				357
				358
				359	\subsubsection{StreamReader Objects \label{stream-reader-objects}}
				360
				361	The \class{StreamReader} class is a subclass of \class{Codec} and
				362	defines the following methods which every stream reader must define in
				363	order to be compatible with the Python codec registry.
				364
				365	\begin{classdesc}{StreamReader}{stream\optional{, errors}}
				366	Constructor for a \class{StreamReader} instance.
				367
				368	All stream readers must provide this constructor interface. They are
				369	free to add additional keyword arguments, but only the ones defined
				370	here are used by the Python codec registry.
				371
				372	\var{stream} must be a file-like object open for reading (binary)
				373	data.
				374
				375	The \class{StreamReader} may implement different error handling
				376	schemes by providing the \var{errors} keyword argument. These
				377	parameters are defined:
				378
				379	\begin{itemize}
				380	\item \code{'strict'} Raise \exception{ValueError} (or a subclass);
				381	this is the default.
				382	\item \code{'ignore'} Ignore the character and continue with the next.
				383	\item \code{'replace'} Replace with a suitable replacement character.
				384	\end{itemize}
Walter Dörwald	430b156	2002-11-07 22:33:17 +0000	[diff] [blame]	385
				386	The \var{errors} argument will be assigned to an attribute of the
				387	same name. Assigning to this attribute makes it possible to switch
				388	between different error handling strategies during the lifetime
				389	of the \class{StreamReader} object.
				390
				391	The set of allowed values for the \var{errors} argument can
				392	be extended with \function{register_error()}.
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	393	\end{classdesc}
				394
				395	\begin{methoddesc}{read}{\optional{size}}
				396	Decodes data from the stream and returns the resulting object.
				397
				398	\var{size} indicates the approximate maximum number of bytes to read
				399	from the stream for decoding purposes. The decoder can modify this
				400	setting as appropriate. The default value -1 indicates to read and
				401	decode as much as possible. \var{size} is intended to prevent having
				402	to decode huge files in one step.
				403
				404	The method should use a greedy read strategy meaning that it should
				405	read as much data as is allowed within the definition of the encoding
				406	and the given size, e.g.\ if optional encoding endings or state
				407	markers are available on the stream, these should be read too.
				408	\end{methoddesc}
				409
				410	\begin{methoddesc}{readline}{\optional{size}}
				411	Read one line from the input stream and return the
				412	decoded data.
				413
Fred Drake	0aa811c	2001-10-20 04:24:09 +0000	[diff] [blame]	414	Unlike the \method{readlines()} method, this method inherits
Fred Drake	602aa77	2000-10-12 20:50:55 +0000	[diff] [blame]	415	the line breaking knowledge from the underlying stream's
				416	\method{readline()} method -- there is currently no support for line
				417	breaking using the codec decoder due to lack of line buffering.
				418	Subclasses should however, if possible, try to implement this method
				419	using their own knowledge of line breaking.
				420
				421	\var{size}, if given, is passed as size argument to the stream's
				422	\method{readline()} method.
				423	\end{methoddesc}
				424
				425	\begin{methoddesc}{readlines}{\optional{sizehint}}
				426	Read all lines available on the input stream and return them as a list
				427	of lines.
				428
				429	Line breaks are implemented using the codec's decoder method and are
				430	included in the list entries.
				431
				432	\var{sizehint}, if given, is passed as \var{size} argument to the
				433	stream's \method{read()} method.
				434	\end{methoddesc}
				435
				436	\begin{methoddesc}{reset}{}
				437	Resets the codec buffers used for keeping state.
				438
				439	Note that no stream repositioning should take place. This method is
				440	primarily intended to be able to recover from decoding errors.
				441	\end{methoddesc}
				442
				443	In addition to the above methods, the \class{StreamReader} must also
				444	inherit all other methods and attributes from the underlying stream.
				445
				446	The next two base classes are included for convenience. They are not
				447	needed by the codec registry, but may prove useful in practice.
				448
				449
				450	\subsubsection{StreamReaderWriter Objects \label{stream-reader-writer}}
				451
				452	The \class{StreamReaderWriter} allows wrapping streams which work in
				453	both read and write modes.
				454
				455	The design is such that one can use the factory functions returned by
				456	the \function{lookup()} function to construct the instance.
				457
				458	\begin{classdesc}{StreamReaderWriter}{stream, Reader, Writer, errors}
				459	Creates a \class{StreamReaderWriter} instance.
				460	\var{stream} must be a file-like object.
				461	\var{Reader} and \var{Writer} must be factory functions or classes
				462	providing the \class{StreamReader} and \class{StreamWriter} interface
				463	respectively.
				464	Error handling is done in the same way as defined for the
				465	stream readers and writers.
				466	\end{classdesc}
				467
				468	\class{StreamReaderWriter} instances define the combined interfaces of
				469	\class{StreamReader} and \class{StreamWriter} classes. They inherit
				470	all other methods and attributes from the underlying stream.
				471
				472
				473	\subsubsection{StreamRecoder Objects \label{stream-recoder-objects}}
				474
				475	The \class{StreamRecoder} provides a frontend-backend view of
				476	encoding data which is sometimes useful when dealing with different
				477	encoding environments.
				478
				479	The design is such that one can use the factory functions returned by
				480	the \function{lookup()} function to construct the instance.
				481
				482	\begin{classdesc}{StreamRecoder}{stream, encode, decode,
				483	Reader, Writer, errors}
				484	Creates a \class{StreamRecoder} instance which implements a two-way
				485	conversion: \var{encode} and \var{decode} work on the frontend (the
				486	input to \method{read()} and output of \method{write()}) while
				487	\var{Reader} and \var{Writer} work on the backend (reading and
				488	writing to the stream).
				489
				490	You can use these objects to do transparent direct recodings from
				491	e.g.\ Latin-1 to UTF-8 and back.
				492
				493	\var{stream} must be a file-like object.
				494
				495	\var{encode}, \var{decode} must adhere to the \class{Codec}
				496	interface, \var{Reader}, \var{Writer} must be factory functions or
				497	classes providing objects of the \class{StreamReader} and
				498	\class{StreamWriter} interface respectively.
				499
				500	\var{encode} and \var{decode} are needed for the frontend
				501	translation, \var{Reader} and \var{Writer} for the backend
				502	translation. The intermediate format used is determined by the two
				503	sets of codecs, e.g.\ the Unicode codecs will use Unicode as
				504	intermediate encoding.
				505
				506	Error handling is done in the same way as defined for the
				507	stream readers and writers.
				508	\end{classdesc}
				509
				510	\class{StreamRecoder} instances define the combined interfaces of
				511	\class{StreamReader} and \class{StreamWriter} classes. They inherit
				512	all other methods and attributes from the underlying stream.
				513
Martin v. Löwis	5c37a77	2002-12-31 12:39:07 +0000	[diff] [blame^]	514	\subsection{Standard Encodings}
				515
				516	Python comes with a number of codecs builtin, either implemented as C
				517	functions, or with dictionaries as mapping tables. The following table
				518	lists the codecs by name, together with a few common aliases, and the
				519	languages for which the encoding is likely used. Neither the list of
				520	aliases nor the list of languages is meant to be exhaustive. Notice
				521	that spelling alternatives that only differ in case or use a hyphen
				522	instead of an underscore are also valid aliases.
				523
				524	Many of the character sets support the same languages. They vary in
				525	individual characters (e.g.\ whether the EURO SIGN is supported or
				526	not), and in the assignment of characters to code positions. For the
				527	European languages in particular, the following variants typically
				528	exist:
				529
				530	\begin{itemize}
				531	\item an ISO 8859 codeset
				532	\item a Microsoft Windows code page, which is typically derived from
				533	an 8859 codeset, but replaces control characters with additional
				534	graphic characters
				535	\item an IBM EBCDIC code page
				536	\item an IBM PC code page, which is ASCII compatible
				537	\end{itemize}
				538
				539	\begin{longtableiii}{l\|l\|l}{textrm}{Codec}{Aliases}{Languages}
				540
				541	\lineiii{ascii}
				542	{646, us-ascii}
				543	{English}
				544
				545	\lineiii{cp037}
				546	{IBM037, IBM039}
				547	{English}
				548
				549	\lineiii{cp424}
				550	{EBCDIC-CP-HE, IBM424}
				551	{Hebrew}
				552
				553	\lineiii{cp437}
				554	{437, IBM437}
				555	{English}
				556
				557	\lineiii{cp500}
				558	{EBCDIC-CP-BE, EBCDIC-CP-CH, IBM500}
				559	{Western Europe}
				560
				561	\lineiii{cp737}
				562	{}
				563	{Greek}
				564
				565	\lineiii{cp775}
				566	{IBM775}
				567	{Baltic languages}
				568
				569	\lineiii{cp850}
				570	{850, IBM850}
				571	{Western Europe}
				572
				573	\lineiii{cp852}
				574	{852, IBM852}
				575	{Central and Eastern Europe}
				576
				577	\lineiii{cp855}
				578	{855, IBM855}
				579	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				580
				581	\lineiii{cp856}
				582	{}
				583	{Hebrew}
				584
				585	\lineiii{cp857}
				586	{857, IBM857}
				587	{Turkish}
				588
				589	\lineiii{cp860}
				590	{860, IBM860}
				591	{Portuguese}
				592
				593	\lineiii{cp861}
				594	{861, CP-IS, IBM861}
				595	{Icelandic}
				596
				597	\lineiii{cp862}
				598	{862, IBM862}
				599	{Hebrew}
				600
				601	\lineiii{cp863}
				602	{863, IBM863}
				603	{Canadian}
				604
				605	\lineiii{cp864}
				606	{IBM864}
				607	{Arabic}
				608
				609	\lineiii{cp865}
				610	{865, IBM865}
				611	{Danish, Norwegian}
				612
				613	\lineiii{cp869}
				614	{869, CP-GR, IBM869}
				615	{Greek}
				616
				617	\lineiii{cp874}
				618	{}
				619	{Thai}
				620
				621	\lineiii{cp875}
				622	{}
				623	{Greek}
				624
				625	\lineiii{cp1006}
				626	{}
				627	{Urdu}
				628
				629	\lineiii{cp1026}
				630	{ibm1026}
				631	{Turkish}
				632
				633	\lineiii{cp1140}
				634	{ibm1140}
				635	{Western Europe}
				636
				637	\lineiii{cp1250}
				638	{windows-1250}
				639	{Central and Eastern Europe}
				640
				641	\lineiii{cp1251}
				642	{windows-1251}
				643	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				644
				645	\lineiii{cp1252}
				646	{windows-1252}
				647	{Western Europe}
				648
				649	\lineiii{cp1253}
				650	{windows-1253}
				651	{Greek}
				652
				653	\lineiii{cp1254}
				654	{windows-1254}
				655	{Turkish}
				656
				657	\lineiii{cp1255}
				658	{windows-1255}
				659	{Hebrew}
				660
				661	\lineiii{cp1256}
				662	{windows-1256}
				663	{Arabic}
				664
				665	\lineiii{cp1257}
				666	{windows-1257}
				667	{Baltic languages}
				668
				669	\lineiii{cp1258}
				670	{windows-1258}
				671	{Vietnamese}
				672
				673	\lineiii{latin_1}
				674	{iso-8859-1, iso8859-1, 8859, cp819, latin, latin1, L1}
				675	{Western Europe}
				676
				677	\lineiii{iso8859_2}
				678	{iso-8859-2, latin2, L2}
				679	{Central and Eastern Europe}
				680
				681	\lineiii{iso8859_3}
				682	{iso-8859-3, latin3, L3}
				683	{Esperanto, Maltese}
				684
				685	\lineiii{iso8859_4}
				686	{iso-8859-4, latin4, L4}
				687	{Baltic languages}
				688
				689	\lineiii{iso8859_5}
				690	{iso-8859-5, cyrillic}
				691	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				692
				693	\lineiii{iso8859_6}
				694	{iso-8859-6, arabic}
				695	{Arabic}
				696
				697	\lineiii{iso8859_7}
				698	{iso-8859-7, greek, greek8}
				699	{Greek}
				700
				701	\lineiii{iso8859_8}
				702	{iso-8859-8, hebrew}
				703	{Hebrew}
				704
				705	\lineiii{iso8859_9}
				706	{iso-8859-9, latin5, L5}
				707	{Turkish}
				708
				709	\lineiii{iso8859_10}
				710	{iso-8859-10, latin6, L6}
				711	{Nordic languages}
				712
				713	\lineiii{iso8859_13}
				714	{iso-8859-13}
				715	{Baltic languages}
				716
				717	\lineiii{iso8859_14}
				718	{iso-8859-14, latin8, L8}
				719	{Celtic languages}
				720
				721	\lineiii{iso8859_15}
				722	{iso-8859-15}
				723	{Western Europe}
				724
				725	\lineiii{koi8_r}
				726	{}
				727	{Russian}
				728
				729	\lineiii{koi8_u}
				730	{}
				731	{Ukrainian}
				732
				733	\lineiii{mac_cyrillic}
				734	{maccyrillic}
				735	{Bulgarian, Byelorussian, Macedonian, Russian, Serbian}
				736
				737	\lineiii{mac_greek}
				738	{macgreek}
				739	{Greek}
				740
				741	\lineiii{mac_iceland}
				742	{maciceland}
				743	{Icelandic}
				744
				745	\lineiii{mac_latin2}
				746	{maclatin2, maccentraleurope}
				747	{Central and Eastern Europe}
				748
				749	\lineiii{mac_roman}
				750	{macroman}
				751	{Western Europe}
				752
				753	\lineiii{mac_turkish}
				754	{macturkish}
				755	{Turkish}
				756
				757	\lineiii{utf_16}
				758	{U16, utf16}
				759	{all languages}
				760
				761	\lineiii{utf_16_be}
				762	{UTF-16BE}
				763	{all languages (BMP only)}
				764
				765	\lineiii{utf_16_le}
				766	{UTF-16LE}
				767	{all languages (BMP only)}
				768
				769	\lineiii{utf_7}
				770	{U7}
				771	{all languages}
				772
				773	\lineiii{utf_8}
				774	{U8, UTF, utf8}
				775	{all languages}
				776
				777	\end{longtableiii}
				778
				779	A number of codecs are specific to Python, so their codec names have
				780	no meaning outside Python. Some of them don't convert from Unicode
				781	strings to byte strings, but instead use the property of the Python
				782	codecs machinery that any bijective function with one argument can be
				783	considered as an encoding.
				784
				785	For the codecs listed below, the result in the ``encoding'' direction
				786	is always a byte string. The result of the ``decoding'' direction is
				787	listed as operand type in the table.
				788
				789	\begin{tableiv}{l|l|l|l}{textrm}{Codec}{Aliases}{Operand type}{Purpose}
				790
				791	\lineiv{base64_codec}
				792	{base64, base-64}
				793	{byte string}
				794	{Convert operand to MIME base64}
				795
				796	\lineiv{hex_codec}
				797	{hex}
				798	{byte string}
				799	{Convert operand to hexadecimal representation, with two digits per byte}
				800
				801	\lineiv{mbcs}
				802	{dbcs}
				803	{Unicode string}
				804	{Windows only: Encode operand according to the ANSI codepage (CP_ACP)}
				805
				806	\lineiv{palmos}
				807	{}
				808	{Unicode string}
				809	{Encoding of PalmOS 3.5}
				810
				811	\lineiv{quopri_codec}
				812	{quopri, quoted-printable, quotedprintable}
				813	{byte string}
				814	{Convert operand to MIME quoted printable}
				815
				816	\lineiv{raw_unicode_escape}
				817	{}
				818	{Unicode string}
				819	{Produce a string that is suitable as raw Unicode literal in Python source code}
				820
				821	\lineiv{rot_13}
				822	{rot13}
				823	{byte string}
				824	{Return the Caesar-cypher encryption of the operand}
				825
				826	\lineiv{string_escape}
				827	{}
				828	{byte string}
				829	{Produce a string that is suitable as string literal in Python source code}
				830
				831	\lineiv{undefined}
				832	{}
				833	{any}
				834	{Raise an exception for all conversions. Can be used as the system encoding if no automatic coercion between byte and Unicode strings is desired.}
				835
				836	\lineiv{unicode_escape}
				837	{}
				838	{Unicode string}
				839	{Produce a string that is suitable as Unicode literal in Python source code}
				840
				841	\lineiv{unicode_internal}
				842	{}
				843	{Unicode string}
				844	{Return the internal representation of the operand}
				845
				846	\lineiv{uu_codec}
				847	{uu}
				848	{byte string}
				849	{Convert the operand using uuencode}
				850
				851	\lineiv{zlib_codec}
				852	{zip, zlib}
				853	{byte string}
				854	{Compress the operand using gzip}
				855
				856	\end{tableiv}