blob: d1ae72804c00446def772ea146c59fd0b07ea354 [file] [log] [blame]
Barry Warsaw5db478f2002-10-01 04:33:16 +00001\declaremodule{standard}{email.Charset}
2\modulesynopsis{Character Sets}
3
4This module provides a class \class{Charset} for representing
5character sets and character set conversions in email messages, as
6well as a character set registry and several convenience methods for
7manipulating this registry. Instances of \class{Charset} are used in
8several other modules within the \module{email} package.
9
10\versionadded{2.2.2}
11
12\begin{classdesc}{Charset}{\optional{input_charset}}
13Map character sets to their email properties.
14
15This class provides information about the requirements imposed on
16email for a specific character set. It also provides convenience
17routines for converting between character sets, given the availability
18of the applicable codecs. Given a character set, it will do its best
19to provide information on how to use that character set in an email
20message in an RFC-compliant way.
21
22Certain character sets must be encoded with quoted-printable or base64
23when used in email headers or bodies. Certain character sets must be
24converted outright, and are not allowed in email.
25
26Optional \var{input_charset} is as described below. After being alias
27normalized it is also used as a lookup into the registry of character
28sets to find out the header encoding, body encoding, and output
29conversion codec to be used for the character set. For example, if
30\var{input_charset} is \code{iso-8859-1}, then headers and bodies will
31be encoded using quoted-printable and no output conversion codec is
32necessary. If \var{input_charset} is \code{euc-jp}, then headers will
33be encoded with base64, bodies will not be encoded, but output text
34will be converted from the \code{euc-jp} character set to the
35\code{iso-2022-jp} character set.
36\end{classdesc}
37
38\class{Charset} instances have the following data attributes:
39
40\begin{datadesc}{input_charset}
41The initial character set specified. Common aliases are converted to
42their \emph{official} email names (e.g.\ \code{latin_1} is converted to
43\code{iso-8859-1}). Defaults to 7-bit \code{us-ascii}.
44\end{datadesc}
45
46\begin{datadesc}{header_encoding}
47If the character set must be encoded before it can be used in an
48email header, this attribute will be set to \code{Charset.QP} (for
49quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
50\code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
51Otherwise, it will be \code{None}.
52\end{datadesc}
53
54\begin{datadesc}{body_encoding}
55Same as \var{header_encoding}, but describes the encoding for the
56mail message's body, which may indeed be different from the header
57encoding. \code{Charset.SHORTEST} is not allowed for
58\var{body_encoding}.
59\end{datadesc}
60
61\begin{datadesc}{output_charset}
62Some character sets must be converted before they can be used in
63email headers or bodies. If the \var{input_charset} is one of
64them, this attribute will contain the name of the character set
65output will be converted to. Otherwise, it will be \code{None}.
66\end{datadesc}
67
68\begin{datadesc}{input_codec}
69The name of the Python codec used to convert the \var{input_charset} to
70Unicode. If no conversion codec is necessary, this attribute will be
71\code{None}.
72\end{datadesc}
73
74\begin{datadesc}{output_codec}
75The name of the Python codec used to convert Unicode to the
76\var{output_charset}. If no conversion codec is necessary, this
77attribute will have the same value as the \var{input_codec}.
78\end{datadesc}
79
80\class{Charset} instances also have the following methods:
81
82\begin{methoddesc}[Charset]{get_body_encoding}{}
83Return the content transfer encoding used for body encoding.
84
85This is either the string \samp{quoted-printable} or \samp{base64}
86depending on the encoding used, or it is a function, in which case you
87should call the function with a single argument, the Message object
88being encoded. The function should then set the
89\mailheader{Content-Transfer-Encoding} header itself to whatever is
90appropriate.
91
92Returns the string \samp{quoted-printable} if
93\var{body_encoding} is \code{QP}, returns the string
94\samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
95string \samp{7bit} otherwise.
96\end{methoddesc}
97
98\begin{methoddesc}{convert}{s}
99Convert the string \var{s} from the \var{input_codec} to the
100\var{output_codec}.
101\end{methoddesc}
102
103\begin{methoddesc}{to_splittable}{s}
104Convert a possibly multibyte string to a safely splittable format.
105\var{s} is the string to split.
106
107Uses the \var{input_codec} to try and convert the string to Unicode,
108so it can be safely split on character boundaries (even for multibyte
109characters).
110
111Returns the string as-is if it isn't known how to convert \var{s} to
112Unicode with the \var{input_charset}.
113
114Characters that could not be converted to Unicode will be replaced
115with the Unicode replacement character \character{U+FFFD}.
116\end{methoddesc}
117
118\begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
119Convert a splittable string back into an encoded string. \var{ustr}
120is a Unicode string to ``unsplit''.
121
122This method uses the proper codec to try and convert the string from
123Unicode back into an encoded format. Return the string as-is if it is
124not Unicode, or if it could not be converted from Unicode.
125
126Characters that could not be converted from Unicode will be replaced
127with an appropriate character (usually \character{?}).
128
129If \var{to_output} is \code{True} (the default), uses
130\var{output_codec} to convert to an
131encoded format. If \var{to_output} is \code{False}, it uses
132\var{input_codec}.
133\end{methoddesc}
134
135\begin{methoddesc}{get_output_charset}{}
136Return the output character set.
137
138This is the \var{output_charset} attribute if that is not \code{None},
139otherwise it is \var{input_charset}.
140\end{methoddesc}
141
142\begin{methoddesc}{encoded_header_len}{s}
143Return the length of the string \var{s} once header-encoded, properly
144calculating for quoted-printable or base64 encoding.
145\end{methoddesc}
146
147\begin{methoddesc}{header_encode}{s\optional{, convert}}
148Header-encode the string \var{s}.
149
150If \var{convert} is \code{True}, the string will be converted from the
151input charset to the output charset automatically. This is not useful
152for multibyte character sets, which have line length issues (multibyte
153characters must be split on a character, not a byte boundary); use the
154higher-level \class{Header} class to deal with these issues (see
155\refmodule{email.Header}). \var{convert} defaults to \code{False}.
156
157The type of encoding (base64 or quoted-printable) will be based on
158the \var{header_encoding} attribute.
159\end{methoddesc}
160
161\begin{methoddesc}{body_encode}{s\optional{, convert}}
162Body-encode the string \var{s}.
163
164If \var{convert} is \code{True} (the default), the string will be
165converted from the input charset to output charset automatically.
166Unlike \method{header_encode()}, there are no issues with byte
167boundaries and multibyte charsets in email bodies, so this is usually
168pretty safe.
169
170The type of encoding (base64 or quoted-printable) will be based on
171the \var{body_encoding} attribute.
172\end{methoddesc}
173
174The \class{Charset} class also provides a number of methods to support
175standard operations and built-in functions.
176
177\begin{methoddesc}[Charset]{__str__}{}
178Returns \var{input_charset} as a string coerced to lower case.
179\end{methoddesc}
180
181\begin{methoddesc}[Charset]{__eq__}{other}
182This method allows you to compare two \class{Charset} instances for equality.
183\end{methoddesc}
184
185\begin{methoddesc}[Charset]{__ne__}{other}
186This method allows you to compare two \class{Charset} instances for inequality.
187\end{methoddesc}
188
189The \module{email.Charset} module also provides the following
190functions for adding new entries to the global character set, alias,
191and codec registries:
192
193\begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
194 body_enc\optional{, output_charset}}}}
195Add character properties to the global registry.
196
197\var{charset} is the input character set, and must be the canonical
198name of a character set.
199
200Optional \var{header_enc} and \var{body_enc} are each one of
201\code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
202base64 encoding, \code{Charset.SHORTEST} for the shortest of
203quoted-printable or base64 encoding, or \code{None} for no encoding.
204\code{SHORTEST} is only valid for \var{header_enc}. The default is
205\code{None} for no encoding.
206
207Optional \var{output_charset} is the character set that the output
208should be in. Conversions will proceed from input charset, to
209Unicode, to the output charset when the method
210\method{Charset.convert()} is called. The default is to output in the
211same character set as the input.
212
213Both \var{charset} and \var{output_charset} must have Unicode
214codec entries in the module's character set-to-codec mapping; use
215\function{add_codec()} to add codecs the module does
216not know about. See the \refmodule{codecs} module's documentation for
217more information.
218
219The global character set registry is kept in the module global
220dictionary \code{CHARSETS}.
221\end{funcdesc}
222
223\begin{funcdesc}{add_alias}{alias, canonical}
224Add a character set alias. \var{alias} is the alias name,
225e.g.\ \code{latin-1}. \var{canonical} is the character set's canonical
226name, e.g.\ \code{iso-8859-1}.
227
228The global charset alias registry is kept in the module global
229dictionary \code{ALIASES}.
230\end{funcdesc}
231
232\begin{funcdesc}{add_codec}{charset, codecname}
233Add a codec that maps characters in the given character set to and from
234Unicode.
235
236\var{charset} is the canonical name of a character set.
237\var{codecname} is the name of a Python codec, as appropriate for the
238second argument to the \function{unicode()} built-in, or to the
239\method{encode()} method of a Unicode string.
240\end{funcdesc}