\declaremodule{standard}{email.Charset}
\modulesynopsis{Character Sets}
3
4This module provides a class \class{Charset} for representing
5character sets and character set conversions in email messages, as
6well as a character set registry and several convenience methods for
7manipulating this registry. Instances of \class{Charset} are used in
8several other modules within the \module{email} package.
9
10\versionadded{2.2.2}
11
12\begin{classdesc}{Charset}{\optional{input_charset}}
13Map character sets to their email properties.
14
15This class provides information about the requirements imposed on
16email for a specific character set. It also provides convenience
17routines for converting between character sets, given the availability
18of the applicable codecs. Given a character set, it will do its best
19to provide information on how to use that character set in an email
20message in an RFC-compliant way.
21
22Certain character sets must be encoded with quoted-printable or base64
23when used in email headers or bodies. Certain character sets must be
24converted outright, and are not allowed in email.
25
Optional \var{input_charset} is as described below; it is always
coerced to lower case.  After alias normalization it is also used
as a lookup into the registry of character sets to find out the header
encoding, body encoding, and output conversion codec to be used for
the character set.  For example, if
\var{input_charset} is \code{iso-8859-1}, then headers and bodies will
be encoded using quoted-printable and no output conversion codec is
necessary.  If \var{input_charset} is \code{euc-jp}, then headers will
be encoded with base64, bodies will not be encoded, but output text
will be converted from the \code{euc-jp} character set to the
\code{iso-2022-jp} character set.
37\end{classdesc}
38
39\class{Charset} instances have the following data attributes:
40
41\begin{datadesc}{input_charset}
The initial character set specified.  Common aliases are converted to
their \emph{official} email names (e.g.\ \code{latin_1} is converted to
\code{iso-8859-1}).  Defaults to 7-bit \code{us-ascii}.
45\end{datadesc}
46
47\begin{datadesc}{header_encoding}
48If the character set must be encoded before it can be used in an
49email header, this attribute will be set to \code{Charset.QP} (for
50quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
51\code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
52Otherwise, it will be \code{None}.
53\end{datadesc}
54
55\begin{datadesc}{body_encoding}
56Same as \var{header_encoding}, but describes the encoding for the
57mail message's body, which indeed may be different than the header
58encoding. \code{Charset.SHORTEST} is not allowed for
59\var{body_encoding}.
60\end{datadesc}
61
62\begin{datadesc}{output_charset}
63Some character sets must be converted before they can be used in
64email headers or bodies. If the \var{input_charset} is one of
65them, this attribute will contain the name of the character set
66output will be converted to. Otherwise, it will be \code{None}.
67\end{datadesc}
68
69\begin{datadesc}{input_codec}
70The name of the Python codec used to convert the \var{input_charset} to
71Unicode. If no conversion codec is necessary, this attribute will be
72\code{None}.
73\end{datadesc}
74
75\begin{datadesc}{output_codec}
76The name of the Python codec used to convert Unicode to the
77\var{output_charset}. If no conversion codec is necessary, this
78attribute will have the same value as the \var{input_codec}.
79\end{datadesc}
80
81\class{Charset} instances also have the following methods:
82
83\begin{methoddesc}[Charset]{get_body_encoding}{}
84Return the content transfer encoding used for body encoding.
85
86This is either the string \samp{quoted-printable} or \samp{base64}
87depending on the encoding used, or it is a function, in which case you
88should call the function with a single argument, the Message object
89being encoded. The function should then set the
90\mailheader{Content-Transfer-Encoding} header itself to whatever is
91appropriate.
92
93Returns the string \samp{quoted-printable} if
94\var{body_encoding} is \code{QP}, returns the string
95\samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
96string \samp{7bit} otherwise.
97\end{methoddesc}
98
99\begin{methoddesc}{convert}{s}
100Convert the string \var{s} from the \var{input_codec} to the
101\var{output_codec}.
102\end{methoddesc}
103
104\begin{methoddesc}{to_splittable}{s}
105Convert a possibly multibyte string to a safely splittable format.
106\var{s} is the string to split.
107
108Uses the \var{input_codec} to try and convert the string to Unicode,
109so it can be safely split on character boundaries (even for multibyte
110characters).
111
112Returns the string as-is if it isn't known how to convert \var{s} to
113Unicode with the \var{input_charset}.
114
115Characters that could not be converted to Unicode will be replaced
116with the Unicode replacement character \character{U+FFFD}.
117\end{methoddesc}
118
119\begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
120Convert a splittable string back into an encoded string. \var{ustr}
121is a Unicode string to ``unsplit''.
122
123This method uses the proper codec to try and convert the string from
124Unicode back into an encoded format. Return the string as-is if it is
125not Unicode, or if it could not be converted from Unicode.
126
127Characters that could not be converted from Unicode will be replaced
128with an appropriate character (usually \character{?}).
129
130If \var{to_output} is \code{True} (the default), uses
131\var{output_codec} to convert to an
132encoded format. If \var{to_output} is \code{False}, it uses
133\var{input_codec}.
134\end{methoddesc}
135
136\begin{methoddesc}{get_output_charset}{}
137Return the output character set.
138
139This is the \var{output_charset} attribute if that is not \code{None},
140otherwise it is \var{input_charset}.
141\end{methoddesc}
142
\begin{methoddesc}{encoded_header_len}{s}
Return the length that the string \var{s} will have once it is
header-encoded, properly calculating for quoted-printable or base64
encoding.
\end{methoddesc}
147
148\begin{methoddesc}{header_encode}{s\optional{, convert}}
149Header-encode the string \var{s}.
150
151If \var{convert} is \code{True}, the string will be converted from the
152input charset to the output charset automatically. This is not useful
153for multibyte character sets, which have line length issues (multibyte
154characters must be split on a character, not a byte boundary); use the
155higher-level \class{Header} class to deal with these issues (see
156\refmodule{email.Header}). \var{convert} defaults to \code{False}.
157
158The type of encoding (base64 or quoted-printable) will be based on
159the \var{header_encoding} attribute.
160\end{methoddesc}
161
162\begin{methoddesc}{body_encode}{s\optional{, convert}}
163Body-encode the string \var{s}.
164
165If \var{convert} is \code{True} (the default), the string will be
166converted from the input charset to output charset automatically.
167Unlike \method{header_encode()}, there are no issues with byte
168boundaries and multibyte charsets in email bodies, so this is usually
169pretty safe.
170
171The type of encoding (base64 or quoted-printable) will be based on
172the \var{body_encoding} attribute.
173\end{methoddesc}
174
175The \class{Charset} class also provides a number of methods to support
176standard operations and built-in functions.
177
\begin{methoddesc}[Charset]{__str__}{}
Returns \var{input_charset} as a string coerced to lower case.
\method{__repr__()} is an alias for \method{__str__()}.
\end{methoddesc}
182
183\begin{methoddesc}[Charset]{__eq__}{other}
184This method allows you to compare two \class{Charset} instances for equality.
185\end{methoddesc}
186
\begin{methoddesc}[Charset]{__ne__}{other}
This method allows you to compare two \class{Charset} instances for inequality.
\end{methoddesc}
190
191The \module{email.Charset} module also provides the following
192functions for adding new entries to the global character set, alias,
193and codec registries:
194
195\begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
196 body_enc\optional{, output_charset}}}}
Add character set properties to the global registry.
198
199\var{charset} is the input character set, and must be the canonical
200name of a character set.
201
Optional \var{header_enc} and \var{body_enc} are each either
\code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
base64 encoding, \code{Charset.SHORTEST} for the shortest of
quoted-printable or base64 encoding, or \code{None} for no encoding.
\code{SHORTEST} is only valid for \var{header_enc}.  The default is
\code{None} for no encoding.
208
209Optional \var{output_charset} is the character set that the output
210should be in. Conversions will proceed from input charset, to
211Unicode, to the output charset when the method
212\method{Charset.convert()} is called. The default is to output in the
213same character set as the input.
214
Both \var{charset} and \var{output_charset} must have Unicode
codec entries in the module's character set-to-codec mapping; use
\function{add_codec()} to add codecs the module does
not know about.  See the \refmodule{codecs} module's documentation for
more information.
220
221The global character set registry is kept in the module global
222dictionary \code{CHARSETS}.
223\end{funcdesc}
224
225\begin{funcdesc}{add_alias}{alias, canonical}
Add a character set alias.  \var{alias} is the alias name,
e.g.\ \code{latin-1}.  \var{canonical} is the character set's canonical
name, e.g.\ \code{iso-8859-1}.
229
230The global charset alias registry is kept in the module global
231dictionary \code{ALIASES}.
232\end{funcdesc}
233
234\begin{funcdesc}{add_codec}{charset, codecname}
Add a codec that maps characters in the given character set to and from
Unicode.
237
238\var{charset} is the canonical name of a character set.
239\var{codecname} is the name of a Python codec, as appropriate for the
240second argument to the \function{unicode()} built-in, or to the
241\method{encode()} method of a Unicode string.
242\end{funcdesc}