% Doc/lib/emailcharsets.tex - platform/external/python/cpython2 - Gitiles

 \declaremodule{standard}{email.Charset}
 \modulesynopsis{Character Sets}

 This module provides a class \class{Charset} for representing
 character sets and character set conversions in email messages, as
 well as a character set registry and several convenience methods for
 manipulating this registry.  Instances of \class{Charset} are used in
 several other modules within the \module{email} package.

 \versionadded{2.2.2}

 \begin{classdesc}{Charset}{\optional{input_charset}}
 Map character sets to their email properties.

 This class provides information about the requirements imposed on
 email for a specific character set.  It also provides convenience
 routines for converting between character sets, given the availability
 of the applicable codecs.  Given a character set, it will do its best
 to provide information on how to use that character set in an email
 message in an RFC-compliant way.

 Certain character sets must be encoded with quoted-printable or base64
 when used in email headers or bodies.  Certain character sets must be
 converted outright, and are not allowed in email.

 Optional \var{input_charset} is as described below; it is always
 coerced to lower case.  After being alias normalized it is also used
 as a lookup into the registry of character sets to find out the header
 encoding, body encoding, and output conversion codec to be used for
 the character set.  For example, if
 \var{input_charset} is \code{iso-8859-1}, then headers and bodies will
 be encoded using quoted-printable and no output conversion codec is
 necessary.  If \var{input_charset} is \code{euc-jp}, then headers will
 be encoded with base64, bodies will not be encoded, but output text
 will be converted from the \code{euc-jp} character set to the
 \code{iso-2022-jp} character set.
 \end{classdesc}

 \class{Charset} instances have the following data attributes:

 \begin{datadesc}{input_charset}
 The initial character set specified.  Common aliases are converted to
 their \emph{official} email names (e.g. \code{latin_1} is converted to
 \code{iso-8859-1}).  Defaults to 7-bit \code{us-ascii}.
 \end{datadesc}

 \begin{datadesc}{header_encoding}
 If the character set must be encoded before it can be used in an
 email header, this attribute will be set to \code{Charset.QP} (for
 quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
 \code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
 Otherwise, it will be \code{None}.
 \end{datadesc}

 \begin{datadesc}{body_encoding}
 Same as \var{header_encoding}, but describes the encoding for the
 mail message's body, which indeed may be different from the header
 encoding.  \code{Charset.SHORTEST} is not allowed for
 \var{body_encoding}.
 \end{datadesc}

 \begin{datadesc}{output_charset}
 Some character sets must be converted before they can be used in
 email headers or bodies.  If the \var{input_charset} is one of
 them, this attribute will contain the name of the character set
 output will be converted to.  Otherwise, it will be \code{None}.
 \end{datadesc}

 \begin{datadesc}{input_codec}
 The name of the Python codec used to convert the \var{input_charset} to
 Unicode.  If no conversion codec is necessary, this attribute will be
 \code{None}.
 \end{datadesc}

 \begin{datadesc}{output_codec}
 The name of the Python codec used to convert Unicode to the
 \var{output_charset}.  If no conversion codec is necessary, this
 attribute will have the same value as the \var{input_codec}.
 \end{datadesc}

 \class{Charset} instances also have the following methods:

 \begin{methoddesc}[Charset]{get_body_encoding}{}
 Return the content transfer encoding used for body encoding.

 This is either the string \samp{quoted-printable} or \samp{base64}
 depending on the encoding used, or it is a function, in which case you
 should call the function with a single argument, the Message object
 being encoded.  The function should then set the
 \mailheader{Content-Transfer-Encoding} header itself to whatever is
 appropriate.

 Returns the string \samp{quoted-printable} if
 \var{body_encoding} is \code{QP}, returns the string
 \samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
 string \samp{7bit} otherwise.
 \end{methoddesc}

 \begin{methoddesc}{convert}{s}
 Convert the string \var{s} from the \var{input_codec} to the
 \var{output_codec}.
 \end{methoddesc}

 \begin{methoddesc}{to_splittable}{s}
 Convert a possibly multibyte string to a safely splittable format.
 \var{s} is the string to split.

 Uses the \var{input_codec} to try and convert the string to Unicode,
 so it can be safely split on character boundaries (even for multibyte
 characters).

 Returns the string as-is if it isn't known how to convert \var{s} to
 Unicode with the \var{input_charset}.

 Characters that could not be converted to Unicode will be replaced
 with the Unicode replacement character \character{U+FFFD}.
 \end{methoddesc}

 \begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
 Convert a splittable string back into an encoded string.  \var{ustr}
 is a Unicode string to ``unsplit''.

 This method uses the proper codec to try and convert the string from
 Unicode back into an encoded format.  Return the string as-is if it is
 not Unicode, or if it could not be converted from Unicode.

 Characters that could not be converted from Unicode will be replaced
 with an appropriate character (usually \character{?}).

 If \var{to_output} is \code{True} (the default), uses
 \var{output_codec} to convert to an
 encoded format.  If \var{to_output} is \code{False}, it uses
 \var{input_codec}.
 \end{methoddesc}

 \begin{methoddesc}{get_output_charset}{}
 Return the output character set.

 This is the \var{output_charset} attribute if that is not \code{None},
 otherwise it is \var{input_charset}.
 \end{methoddesc}

 \begin{methoddesc}{encoded_header_len}{s}
 Return the length of the encoded header string for \var{s}, properly
 calculating for quoted-printable or base64 encoding.
 \end{methoddesc}

 \begin{methoddesc}{header_encode}{s\optional{, convert}}
 Header-encode the string \var{s}.

 If \var{convert} is \code{True}, the string will be converted from the
 input charset to the output charset automatically.  This is not useful
 for multibyte character sets, which have line length issues (multibyte
 characters must be split on a character, not a byte boundary); use the
 higher-level \class{Header} class to deal with these issues (see
 \refmodule{email.Header}).  \var{convert} defaults to \code{False}.

 The type of encoding (base64 or quoted-printable) will be based on
 the \var{header_encoding} attribute.
 \end{methoddesc}

 \begin{methoddesc}{body_encode}{s\optional{, convert}}
 Body-encode the string \var{s}.

 If \var{convert} is \code{True} (the default), the string will be
 converted from the input charset to output charset automatically.
 Unlike \method{header_encode()}, there are no issues with byte
 boundaries and multibyte charsets in email bodies, so this is usually
 pretty safe.

 The type of encoding (base64 or quoted-printable) will be based on
 the \var{body_encoding} attribute.
 \end{methoddesc}

 The \class{Charset} class also provides a number of methods to support
 standard operations and built-in functions.

 \begin{methoddesc}[Charset]{__str__}{}
 Returns \var{input_charset} as a string coerced to lower case.
 \method{__repr__()} is an alias for \method{__str__()}.
 \end{methoddesc}

 \begin{methoddesc}[Charset]{__eq__}{other}
 This method allows you to compare two \class{Charset} instances for equality.
 \end{methoddesc}

 \begin{methoddesc}[Charset]{__ne__}{other}
 This method allows you to compare two \class{Charset} instances for inequality.
 \end{methoddesc}

 The \module{email.Charset} module also provides the following
 functions for adding new entries to the global character set, alias,
 and codec registries:

 \begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
     body_enc\optional{, output_charset}}}}
 Add character properties to the global registry.

 \var{charset} is the input character set, and must be the canonical
 name of a character set.

 Optional \var{header_enc} and \var{body_enc} are each one of
 \code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
 base64 encoding, \code{Charset.SHORTEST} for the shortest of
 quoted-printable or base64 encoding, or \code{None} for no encoding.
 \code{SHORTEST} is only valid for \var{header_enc}. The default is
 \code{None} for no encoding.

 Optional \var{output_charset} is the character set that the output
 should be in.  Conversions will proceed from input charset, to
 Unicode, to the output charset when the method
 \method{Charset.convert()} is called.  The default is to output in the
 same character set as the input.

 Both \var{charset} and \var{output_charset} must have Unicode
 codec entries in the module's character set-to-codec mapping; use
 \function{add_codec()} to add codecs the module does
 not know about.  See the \refmodule{codecs} module's documentation for
 more information.

 The global character set registry is kept in the module global
 dictionary \code{CHARSETS}.
 \end{funcdesc}

 \begin{funcdesc}{add_alias}{alias, canonical}
 Add a character set alias.  \var{alias} is the alias name,
 e.g. \code{latin-1}.  \var{canonical} is the character set's canonical
 name, e.g. \code{iso-8859-1}.

 The global charset alias registry is kept in the module global
 dictionary \code{ALIASES}.
 \end{funcdesc}

 \begin{funcdesc}{add_codec}{charset, codecname}
 Add a codec that maps characters in the given character set to and from
 Unicode.

 \var{charset} is the canonical name of a character set.
 \var{codecname} is the name of a Python codec, as appropriate for the
 second argument to the \function{unicode()} built-in, or to the
 \method{encode()} method of a Unicode string.
 \end{funcdesc}
	\declaremodule{standard}{email.Charset}
	\modulesynopsis{Character Sets}

	This module provides a class \class{Charset} for representing
	character sets and character set conversions in email messages, as
	well as a character set registry and several convenience methods for
	manipulating this registry. Instances of \class{Charset} are used in
	several other modules within the \module{email} package.

	\versionadded{2.2.2}

	\begin{classdesc}{Charset}{\optional{input_charset}}
	Map character sets to their email properties.

	This class provides information about the requirements imposed on
	email for a specific character set. It also provides convenience
	routines for converting between character sets, given the availability
	of the applicable codecs. Given a character set, it will do its best
	to provide information on how to use that character set in an email
	message in an RFC-compliant way.

	Certain character sets must be encoded with quoted-printable or base64
	when used in email headers or bodies. Certain character sets must be
	converted outright, and are not allowed in email.

	Optional \var{input_charset} is as described below; it is always
	coerced to lower case. After being alias normalized it is also used
	as a lookup into the registry of character sets to find out the header
	encoding, body encoding, and output conversion codec to be used for
	the character set. For example, if
	\var{input_charset} is \code{iso-8859-1}, then headers and bodies will
	be encoded using quoted-printable and no output conversion codec is
	necessary. If \var{input_charset} is \code{euc-jp}, then headers will
	be encoded with base64, bodies will not be encoded, but output text
	will be converted from the \code{euc-jp} character set to the
	\code{iso-2022-jp} character set.
	\end{classdesc}

	\class{Charset} instances have the following data attributes:

	\begin{datadesc}{input_charset}
	The initial character set specified. Common aliases are converted to
	their \emph{official} email names (e.g. \code{latin_1} is converted to
	\code{iso-8859-1}). Defaults to 7-bit \code{us-ascii}.
	\end{datadesc}

	\begin{datadesc}{header_encoding}
	If the character set must be encoded before it can be used in an
	email header, this attribute will be set to \code{Charset.QP} (for
	quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
	\code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
	Otherwise, it will be \code{None}.
	\end{datadesc}

	\begin{datadesc}{body_encoding}
	Same as \var{header_encoding}, but describes the encoding for the
	mail message's body, which indeed may be different from the header
	encoding. \code{Charset.SHORTEST} is not allowed for
	\var{body_encoding}.
	\end{datadesc}

	\begin{datadesc}{output_charset}
	Some character sets must be converted before they can be used in
	email headers or bodies. If the \var{input_charset} is one of
	them, this attribute will contain the name of the character set
	output will be converted to. Otherwise, it will be \code{None}.
	\end{datadesc}

	\begin{datadesc}{input_codec}
	The name of the Python codec used to convert the \var{input_charset} to
	Unicode. If no conversion codec is necessary, this attribute will be
	\code{None}.
	\end{datadesc}

	\begin{datadesc}{output_codec}
	The name of the Python codec used to convert Unicode to the
	\var{output_charset}. If no conversion codec is necessary, this
	attribute will have the same value as the \var{input_codec}.
	\end{datadesc}

	\class{Charset} instances also have the following methods:

	\begin{methoddesc}[Charset]{get_body_encoding}{}
	Return the content transfer encoding used for body encoding.

	This is either the string \samp{quoted-printable} or \samp{base64}
	depending on the encoding used, or it is a function, in which case you
	should call the function with a single argument, the Message object
	being encoded. The function should then set the
	\mailheader{Content-Transfer-Encoding} header itself to whatever is
	appropriate.

	Returns the string \samp{quoted-printable} if
	\var{body_encoding} is \code{QP}, returns the string
	\samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
	string \samp{7bit} otherwise.
	\end{methoddesc}

	\begin{methoddesc}{convert}{s}
	Convert the string \var{s} from the \var{input_codec} to the
	\var{output_codec}.
	\end{methoddesc}

	\begin{methoddesc}{to_splittable}{s}
	Convert a possibly multibyte string to a safely splittable format.
	\var{s} is the string to split.

	Uses the \var{input_codec} to try and convert the string to Unicode,
	so it can be safely split on character boundaries (even for multibyte
	characters).

	Returns the string as-is if it isn't known how to convert \var{s} to
	Unicode with the \var{input_charset}.

	Characters that could not be converted to Unicode will be replaced
	with the Unicode replacement character \character{U+FFFD}.
	\end{methoddesc}

	\begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
	Convert a splittable string back into an encoded string. \var{ustr}
	is a Unicode string to ``unsplit''.

	This method uses the proper codec to try and convert the string from
	Unicode back into an encoded format. Return the string as-is if it is
	not Unicode, or if it could not be converted from Unicode.

	Characters that could not be converted from Unicode will be replaced
	with an appropriate character (usually \character{?}).

	If \var{to_output} is \code{True} (the default), uses
	\var{output_codec} to convert to an
	encoded format. If \var{to_output} is \code{False}, it uses
	\var{input_codec}.
	\end{methoddesc}

	\begin{methoddesc}{get_output_charset}{}
	Return the output character set.

	This is the \var{output_charset} attribute if that is not \code{None},
	otherwise it is \var{input_charset}.
	\end{methoddesc}

	\begin{methoddesc}{encoded_header_len}{s}
	Return the length of the encoded header string for \var{s}, properly
	calculating for quoted-printable or base64 encoding.
	\end{methoddesc}

	\begin{methoddesc}{header_encode}{s\optional{, convert}}
	Header-encode the string \var{s}.

	If \var{convert} is \code{True}, the string will be converted from the
	input charset to the output charset automatically. This is not useful
	for multibyte character sets, which have line length issues (multibyte
	characters must be split on a character, not a byte boundary); use the
	higher-level \class{Header} class to deal with these issues (see
	\refmodule{email.Header}). \var{convert} defaults to \code{False}.

	The type of encoding (base64 or quoted-printable) will be based on
	the \var{header_encoding} attribute.
	\end{methoddesc}

	\begin{methoddesc}{body_encode}{s\optional{, convert}}
	Body-encode the string \var{s}.

	If \var{convert} is \code{True} (the default), the string will be
	converted from the input charset to output charset automatically.
	Unlike \method{header_encode()}, there are no issues with byte
	boundaries and multibyte charsets in email bodies, so this is usually
	pretty safe.

	The type of encoding (base64 or quoted-printable) will be based on
	the \var{body_encoding} attribute.
	\end{methoddesc}

	The \class{Charset} class also provides a number of methods to support
	standard operations and built-in functions.

	\begin{methoddesc}[Charset]{__str__}{}
	Returns \var{input_charset} as a string coerced to lower case.
	\method{__repr__()} is an alias for \method{__str__()}.
	\end{methoddesc}

	\begin{methoddesc}[Charset]{__eq__}{other}
	This method allows you to compare two \class{Charset} instances for equality.
	\end{methoddesc}

	\begin{methoddesc}[Charset]{__ne__}{other}
	This method allows you to compare two \class{Charset} instances for inequality.
	\end{methoddesc}

	The \module{email.Charset} module also provides the following
	functions for adding new entries to the global character set, alias,
	and codec registries:

	\begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
	body_enc\optional{, output_charset}}}}
	Add character properties to the global registry.

	\var{charset} is the input character set, and must be the canonical
	name of a character set.

	Optional \var{header_enc} and \var{body_enc} are each one of
	\code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
	base64 encoding, \code{Charset.SHORTEST} for the shortest of
	quoted-printable or base64 encoding, or \code{None} for no encoding.
	\code{SHORTEST} is only valid for \var{header_enc}. The default is
	\code{None} for no encoding.

	Optional \var{output_charset} is the character set that the output
	should be in. Conversions will proceed from input charset, to
	Unicode, to the output charset when the method
	\method{Charset.convert()} is called. The default is to output in the
	same character set as the input.

	Both \var{charset} and \var{output_charset} must have Unicode
	codec entries in the module's character set-to-codec mapping; use
	\function{add_codec()} to add codecs the module does
	not know about. See the \refmodule{codecs} module's documentation for
	more information.

	The global character set registry is kept in the module global
	dictionary \code{CHARSETS}.
	\end{funcdesc}

	\begin{funcdesc}{add_alias}{alias, canonical}
	Add a character set alias. \var{alias} is the alias name,
	e.g. \code{latin-1}. \var{canonical} is the character set's canonical
	name, e.g. \code{iso-8859-1}.

	The global charset alias registry is kept in the module global
	dictionary \code{ALIASES}.
	\end{funcdesc}

	\begin{funcdesc}{add_codec}{charset, codecname}
	Add a codec that maps characters in the given character set to and from
	Unicode.

	\var{charset} is the canonical name of a character set.
	\var{codecname} is the name of a Python codec, as appropriate for the
	second argument to the \function{unicode()} built-in, or to the
	\method{encode()} method of a Unicode string.
	\end{funcdesc}