\declaremodule{standard}{email.charset}
\modulesynopsis{Character Sets}

This module provides a class \class{Charset} for representing
character sets and character set conversions in email messages, as
well as a character set registry and several convenience functions for
manipulating this registry. Instances of \class{Charset} are used in
several other modules within the \module{email} package.

Import this class from the \module{email.charset} module.

\versionadded{2.2.2}
13
14\begin{classdesc}{Charset}{\optional{input_charset}}
15Map character sets to their email properties.
16
17This class provides information about the requirements imposed on
18email for a specific character set. It also provides convenience
19routines for converting between character sets, given the availability
20of the applicable codecs. Given a character set, it will do its best
21to provide information on how to use that character set in an email
22message in an RFC-compliant way.
23
24Certain character sets must be encoded with quoted-printable or base64
25when used in email headers or bodies. Certain character sets must be
26converted outright, and are not allowed in email.
27
Optional \var{input_charset} is as described below; it is always
coerced to lower case. After being alias normalized it is also used
as a lookup into the registry of character sets to find out the header
encoding, body encoding, and output conversion codec to be used for
the character set. For example, if
\var{input_charset} is \code{iso-8859-1}, then headers and bodies will
be encoded using quoted-printable and no output conversion codec is
necessary. If \var{input_charset} is \code{euc-jp}, then headers will
be encoded with base64, bodies will not be encoded, but output text
will be converted from the \code{euc-jp} character set to the
\code{iso-2022-jp} character set.
39\end{classdesc}
40
41\class{Charset} instances have the following data attributes:
42
\begin{datadesc}{input_charset}
The initial character set specified. Common aliases are converted to
their \emph{official} email names (e.g.\ \code{latin_1} is converted to
\code{iso-8859-1}). Defaults to 7-bit \code{us-ascii}.
\end{datadesc}
48
49\begin{datadesc}{header_encoding}
50If the character set must be encoded before it can be used in an
51email header, this attribute will be set to \code{Charset.QP} (for
52quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
53\code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
54Otherwise, it will be \code{None}.
55\end{datadesc}
56
57\begin{datadesc}{body_encoding}
58Same as \var{header_encoding}, but describes the encoding for the
59mail message's body, which indeed may be different than the header
60encoding. \code{Charset.SHORTEST} is not allowed for
61\var{body_encoding}.
62\end{datadesc}
63
64\begin{datadesc}{output_charset}
65Some character sets must be converted before they can be used in
66email headers or bodies. If the \var{input_charset} is one of
67them, this attribute will contain the name of the character set
68output will be converted to. Otherwise, it will be \code{None}.
69\end{datadesc}
70
71\begin{datadesc}{input_codec}
72The name of the Python codec used to convert the \var{input_charset} to
73Unicode. If no conversion codec is necessary, this attribute will be
74\code{None}.
75\end{datadesc}
76
77\begin{datadesc}{output_codec}
78The name of the Python codec used to convert Unicode to the
79\var{output_charset}. If no conversion codec is necessary, this
80attribute will have the same value as the \var{input_codec}.
81\end{datadesc}
82
83\class{Charset} instances also have the following methods:
84
85\begin{methoddesc}[Charset]{get_body_encoding}{}
86Return the content transfer encoding used for body encoding.
87
88This is either the string \samp{quoted-printable} or \samp{base64}
89depending on the encoding used, or it is a function, in which case you
90should call the function with a single argument, the Message object
91being encoded. The function should then set the
92\mailheader{Content-Transfer-Encoding} header itself to whatever is
93appropriate.
94
95Returns the string \samp{quoted-printable} if
96\var{body_encoding} is \code{QP}, returns the string
97\samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
98string \samp{7bit} otherwise.
99\end{methoddesc}
100
101\begin{methoddesc}{convert}{s}
102Convert the string \var{s} from the \var{input_codec} to the
103\var{output_codec}.
104\end{methoddesc}
105
106\begin{methoddesc}{to_splittable}{s}
107Convert a possibly multibyte string to a safely splittable format.
108\var{s} is the string to split.
109
110Uses the \var{input_codec} to try and convert the string to Unicode,
111so it can be safely split on character boundaries (even for multibyte
112characters).
113
114Returns the string as-is if it isn't known how to convert \var{s} to
115Unicode with the \var{input_charset}.
116
117Characters that could not be converted to Unicode will be replaced
118with the Unicode replacement character \character{U+FFFD}.
119\end{methoddesc}
120
121\begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
122Convert a splittable string back into an encoded string. \var{ustr}
123is a Unicode string to ``unsplit''.
124
125This method uses the proper codec to try and convert the string from
126Unicode back into an encoded format. Return the string as-is if it is
127not Unicode, or if it could not be converted from Unicode.
128
129Characters that could not be converted from Unicode will be replaced
130with an appropriate character (usually \character{?}).
131
132If \var{to_output} is \code{True} (the default), uses
133\var{output_codec} to convert to an
134encoded format. If \var{to_output} is \code{False}, it uses
135\var{input_codec}.
136\end{methoddesc}
137
138\begin{methoddesc}{get_output_charset}{}
139Return the output character set.
140
141This is the \var{output_charset} attribute if that is not \code{None},
142otherwise it is \var{input_charset}.
143\end{methoddesc}
144
\begin{methoddesc}{encoded_header_len}{s}
Return the length that the string \var{s} will have once it is
header-encoded, properly calculating for quoted-printable or base64
encoding.
\end{methoddesc}
149
\begin{methoddesc}{header_encode}{s\optional{, convert}}
Header-encode the string \var{s}.

If \var{convert} is \code{True}, the string will be converted from the
input charset to the output charset automatically. This is not useful
for multibyte character sets, which have line length issues (multibyte
characters must be split on a character, not a byte boundary); use the
higher-level \class{Header} class to deal with these issues (see
\refmodule{email.header}). \var{convert} defaults to \code{False}.

The type of encoding (base64 or quoted-printable) will be based on
the \var{header_encoding} attribute.
\end{methoddesc}
163
164\begin{methoddesc}{body_encode}{s\optional{, convert}}
165Body-encode the string \var{s}.
166
167If \var{convert} is \code{True} (the default), the string will be
168converted from the input charset to output charset automatically.
169Unlike \method{header_encode()}, there are no issues with byte
170boundaries and multibyte charsets in email bodies, so this is usually
171pretty safe.
172
173The type of encoding (base64 or quoted-printable) will be based on
174the \var{body_encoding} attribute.
175\end{methoddesc}
176
177The \class{Charset} class also provides a number of methods to support
178standard operations and built-in functions.
179
\begin{methoddesc}[Charset]{__str__}{}
Returns \var{input_charset} as a string coerced to lower case.
\method{__repr__()} is an alias for \method{__str__()}.
\end{methoddesc}
184
185\begin{methoddesc}[Charset]{__eq__}{other}
186This method allows you to compare two \class{Charset} instances for equality.
187\end{methoddesc}
188
\begin{methoddesc}[Charset]{__ne__}{other}
This method allows you to compare two \class{Charset} instances for inequality.
\end{methoddesc}
192
The \module{email.charset} module also provides the following
functions for adding new entries to the global character set, alias,
and codec registries:
196
197\begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
198 body_enc\optional{, output_charset}}}}
Add character set properties to the global registry.
200
201\var{charset} is the input character set, and must be the canonical
202name of a character set.
203
Optional \var{header_enc} and \var{body_enc} are each either
\code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
base64 encoding, \code{Charset.SHORTEST} for the shortest of
quoted-printable or base64 encoding, or \code{None} for no encoding.
\code{Charset.SHORTEST} is only valid for \var{header_enc}. The
default for both is \code{None} for no encoding.
210
211Optional \var{output_charset} is the character set that the output
212should be in. Conversions will proceed from input charset, to
213Unicode, to the output charset when the method
214\method{Charset.convert()} is called. The default is to output in the
215same character set as the input.
216
Both \var{charset} and \var{output_charset} must have Unicode
codec entries in the module's character set-to-codec mapping; use
\function{add_codec()} to add codecs the module does
not know about. See the \refmodule{codecs} module's documentation for
more information.
222
223The global character set registry is kept in the module global
224dictionary \code{CHARSETS}.
225\end{funcdesc}
226
227\begin{funcdesc}{add_alias}{alias, canonical}
Add a character set alias. \var{alias} is the alias name,
e.g.\ \code{latin-1}. \var{canonical} is the character set's canonical
name, e.g.\ \code{iso-8859-1}.
231
232The global charset alias registry is kept in the module global
233dictionary \code{ALIASES}.
234\end{funcdesc}
235
236\begin{funcdesc}{add_codec}{charset, codecname}
Add a codec that maps characters in the given character set to and from
Unicode.
239
240\var{charset} is the canonical name of a character set.
241\var{codecname} is the name of a Python codec, as appropriate for the
242second argument to the \function{unicode()} built-in, or to the
243\method{encode()} method of a Unicode string.
244\end{funcdesc}