blob: 172e5d6539d8ea5acf0e7780b8e8296ff9ca63ae [file] [log] [blame]
Barry Warsaw5b9da892002-10-01 01:05:52 +00001\declaremodule{standard}{email.Header}
2\modulesynopsis{Representing non-ASCII headers}
3
4\rfc{2822} is the base standard that describes the format of email
5messages. It derives from the older \rfc{822} standard which came
6into widespread use at a time when most email was composed of \ASCII{}
7characters only. \rfc{2822} is a specification written assuming email
8contains only 7-bit \ASCII{} characters.
9
10Of course, as email has been deployed worldwide, it has become
11internationalized, such that language-specific character sets can now
12be used in email messages. The base standard still requires email
13messages to be transferred using only 7-bit \ASCII{} characters, so a
14slew of RFCs have been written describing how to encode email
15containing non-\ASCII{} characters into \rfc{2822}-compliant format.
16These RFCs include \rfc{2045}, \rfc{2046}, \rfc{2047}, and \rfc{2231}.
17The \module{email} package supports these standards in its
18\module{email.Header} and \module{email.Charset} modules.
19
20If you want to include non-\ASCII{} characters in your email headers,
21say in the \mailheader{Subject} or \mailheader{To} fields, you should
22use the \class{Header} class (in module \module{email.Header}) and
23assign the field in the \class{Message} object to an instance of
24\class{Header} instead of using a string for the header value. For
25example:
26
27\begin{verbatim}
28>>> from email.Message import Message
29>>> from email.Header import Header
30>>> msg = Message()
31>>> h = Header('p\xf6stal', 'iso-8859-1')
32>>> msg['Subject'] = h
33>>> print msg.as_string()
34Subject: =?iso-8859-1?q?p=F6stal?=
35
36
37\end{verbatim}
38
39Notice here how we wanted the \mailheader{Subject} field to contain a
40non-\ASCII{} character? We did this by creating a \class{Header}
41instance and passing in the character set that the byte string was
42encoded in. When the subsequent \class{Message} instance was
43flattened, the \mailheader{Subject} field was properly \rfc{2047}
44encoded. MIME-aware mail readers would show this header using the
45embedded ISO-8859-1 character.
46
47\versionadded{2.2.2}
48
49Here is the \class{Header} class description:
50
51\begin{classdesc}{Header}{\optional{s\optional{, charset\optional{,
52 maxlinelen\optional{, header_name\optional{, continuation_ws}}}}}}
53Create a MIME-compliant header that can contain many character sets.
54
55Optional \var{s} is the initial header value. If \code{None} (the
56default), the initial header value is not set. You can later append
57to the header with \method{append()} method calls. \var{s} may be a
58byte string or a Unicode string, but see the \method{append()}
59documentation for semantics.
60
61Optional \var{charset} serves two purposes: it has the same meaning as
62the \var{charset} argument to the \method{append()} method. It also
63sets the default character set for all subsequent \method{append()}
64calls that omit the \var{charset} argument. If \var{charset} is not
65provided in the constructor (the default), the \code{us-ascii}
66character set is used both as \var{s}'s initial charset and as the
67default for subsequent \method{append()} calls.
68
69The maximum line length can be specified explicitly via
70\var{maxlinelen}. For splitting the first line to a shorter value (to
71account for the field header which isn't included in \var{s},
72e.g. \mailheader{Subject}) pass in the name of the field in
73\var{header_name}. The default \var{maxlinelen} is 76, and the
74default value for \var{header_name} is \code{None}, meaning it is not
75taken into account for the first line of a long, split header.
76
77Optional \var{continuation_ws} must be \rfc{2822}-compliant folding
78whitespace, and is usually either a space or a hard tab character.
79This character will be prepended to continuation lines.
80\end{classdesc}
81
82\begin{methoddesc}[Header]{append}{s\optional{, charset}}
83Append the string \var{s} to the MIME header.
84
85Optional \var{charset}, if given, should be a \class{Charset} instance
86(see \refmodule{email.Charset}) or the name of a character set, which
87will be converted to a \class{Charset} instance. A value of
88\code{None} (the default) means that the \var{charset} given in the
89constructor is used.
90
91\var{s} may be a byte string or a Unicode string. If it is a byte
92string (i.e. \code{isinstance(s, StringType)} is true), then
93\var{charset} is the encoding of that byte string, and a
94\exception{UnicodeError} will be raised if the string cannot be
95decoded with that character set.
96
97If \var{s} is a Unicode string, then \var{charset} is a hint
98specifying the character set of the characters in the string. In this
99case, when producing an \rfc{2822}-compliant header using \rfc{2047}
100rules, the Unicode string will be encoded using the following charsets
101in order: \code{us-ascii}, the \var{charset} hint, \code{utf-8}. The
102first character set to not provoke a \exception{UnicodeError} is used.
103\end{methoddesc}
104
105\begin{methoddesc}[Header]{encode}{}
106Encode a message header into an RFC-compliant format, possibly
107wrapping long lines and encapsulating non-\ASCII{} parts in base64 or
108quoted-printable encodings.
109\end{methoddesc}
110
111The \class{Header} class also provides a number of methods to support
112standard operators and built-in functions.
113
114\begin{methoddesc}[Header]{__str__}{}
115A synonym for \method{Header.encode()}. Useful for
116\code{str(aHeader)} calls.
117\end{methoddesc}
118
119\begin{methoddesc}[Header]{__unicode__}{}
120A helper for the built-in \function{unicode()} function. Returns the
121header as a Unicode string.
122\end{methoddesc}
123
124\begin{methoddesc}[Header]{__eq__}{other}
125This method allows you to compare two \class{Header} instances for equality.
126\end{methoddesc}
127
128\begin{methoddesc}[Header]{__ne__}{other}
129This method allows you to compare two \class{Header} instances for inequality.
130\end{methoddesc}
131
132The \module{email.Header} module also provides the following
133convenient functions.
134
135\begin{funcdesc}{decode_header}{header}
136Decode a message header value without converting the character set.
137The header value is in \var{header}.
138
139This function returns a list of \code{(decoded_string, charset)} pairs
140containing each of the decoded parts of the header. \var{charset} is
141\code{None} for non-encoded parts of the header, otherwise a lower
142case string containing the name of the character set specified in the
143encoded string.
144
145Here's an example:
146
147\begin{verbatim}
148>>> from email.Header import decode_header
149>>> decode_header('=?iso-8859-1?q?p=F6stal?=')
150[('p\xf6stal', 'iso-8859-1')]
151\end{verbatim}
152\end{funcdesc}
153
154\begin{funcdesc}{make_header}{decoded_seq\optional{, maxlinelen\optional{,
155 header_name\optional{, continuation_ws}}}}
156Create a \class{Header} instance from a sequence of pairs as returned
157by \function{decode_header()}.
158
159\function{decode_header()} takes a header value string and returns a
160sequence of pairs of the format \code{(decoded_string, charset)} where
161\var{charset} is the name of the character set.
162
163This function takes one of those sequence of pairs and returns a
164\class{Header} instance. Optional \var{maxlinelen},
165\var{header_name}, and \var{continuation_ws} are as in the
166\class{Header} constructor.
167\end{funcdesc}
168
169\declaremodule{standard}{email.Charset}
170\modulesynopsis{Character Sets}
171
172This module provides a class \class{Charset} for representing
173character sets and character set conversions in email messages, as
174well as a character set registry and several convenience methods for
175manipulating this registry. Instances of \class{Charset} are used in
176several other modules within the \module{email} package.
177
178\versionadded{2.2.2}
179
180\begin{classdesc}{Charset}{\optional{input_charset}}
181Map character sets to their email properties.
182
183This class provides information about the requirements imposed on
184email for a specific character set. It also provides convenience
185routines for converting between character sets, given the availability
186of the applicable codecs. Given a character set, it will do its best
187to provide information on how to use that character set in an email
188message in an RFC-compliant way.
189
190Certain character sets must be encoded with quoted-printable or base64
191when used in email headers or bodies. Certain character sets must be
192converted outright, and are not allowed in email.
193
194Optional \var{input_charset} is as described below. After being alias
195normalized it is also used as a lookup into the registry of character
196sets to find out the header encoding, body encoding, and output
197conversion codec to be used for the character set. For example, if
198\var{input_charset} is \code{iso-8859-1}, then headers and bodies will
199be encoded using quoted-printable and no output conversion codec is
200necessary. If \var{input_charset} is \code{euc-jp}, then headers will
201be encoded with base64, bodies will not be encoded, but output text
202will be converted from the \code{euc-jp} character set to the
203\code{iso-2022-jp} character set.
204\end{classdesc}
205
206\class{Charset} instances have the following data attributes:
207
208\begin{datadesc}{input_charset}
209The initial character set specified. Common aliases are converted to
210their \emph{official} email names (e.g. \code{latin_1} is converted to
211\code{iso-8859-1}). Defaults to 7-bit \code{us-ascii}.
212\end{datadesc}
213
214\begin{datadesc}{header_encoding}
215If the character set must be encoded before it can be used in an
216email header, this attribute will be set to \code{Charset.QP} (for
217quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
218\code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
219Otherwise, it will be \code{None}.
220\end{datadesc}
221
222\begin{datadesc}{body_encoding}
223Same as \var{header_encoding}, but describes the encoding for the
224mail message's body, which indeed may be different than the header
225encoding. \code{Charset.SHORTEST} is not allowed for
226\var{body_encoding}.
227\end{datadesc}
228
229\begin{datadesc}{output_charset}
230Some character sets must be converted before they can be used in
231email headers or bodies. If the \var{input_charset} is one of
232them, this attribute will contain the name of the character set
233output will be converted to. Otherwise, it will be \code{None}.
234\end{datadesc}
235
236\begin{datadesc}{input_codec}
237The name of the Python codec used to convert the \var{input_charset} to
238Unicode. If no conversion codec is necessary, this attribute will be
239\code{None}.
240\end{datadesc}
241
242\begin{datadesc}{output_codec}
243The name of the Python codec used to convert Unicode to the
244\var{output_charset}. If no conversion codec is necessary, this
245attribute will have the same value as the \var{input_codec}.
246\end{datadesc}
247
248\class{Charset} instances also have the following methods:
249
250\begin{methoddesc}[Charset]{get_body_encoding}{}
251Return the content transfer encoding used for body encoding.
252
253This is either the string \samp{quoted-printable} or \samp{base64}
254depending on the encoding used, or it is a function, in which case you
255should call the function with a single argument, the Message object
256being encoded. The function should then set the
257\mailheader{Content-Transfer-Encoding} header itself to whatever is
258appropriate.
259
260Returns the string \samp{quoted-printable} if
261\var{body_encoding} is \code{QP}, returns the string
262\samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
263string \samp{7bit} otherwise.
264\end{methoddesc}
265
266\begin{methoddesc}{convert}{s}
267Convert the string \var{s} from the \var{input_codec} to the
268\var{output_codec}.
269\end{methoddesc}
270
271\begin{methoddesc}{to_splittable}{s}
272Convert a possibly multibyte string to a safely splittable format.
273\var{s} is the string to split.
274
275Uses the \var{input_codec} to try and convert the string to Unicode,
276so it can be safely split on character boundaries (even for multibyte
277characters).
278
279Returns the string as-is if it isn't known how to convert \var{s} to
280Unicode with the \var{input_charset}.
281
282Characters that could not be converted to Unicode will be replaced
283with the Unicode replacement character \character{U+FFFD}.
284\end{methoddesc}
285
286\begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
287Convert a splittable string back into an encoded string. \var{ustr}
288is a Unicode string to ``unsplit''.
289
290This method uses the proper codec to try and convert the string from
291Unicode back into an encoded format. Return the string as-is if it is
292not Unicode, or if it could not be converted from Unicode.
293
294Characters that could not be converted from Unicode will be replaced
295with an appropriate character (usually \character{?}).
296
297If \var{to_output} is \code{True} (the default), uses
298\var{output_codec} to convert to an
299encoded format. If \var{to_output} is \code{False}, it uses
300\var{input_codec}.
301\end{methoddesc}
302
303\begin{methoddesc}{get_output_charset}{}
304Return the output character set.
305
306This is the \var{output_charset} attribute if that is not \code{None},
307otherwise it is \var{input_charset}.
308\end{methoddesc}
309
310\begin{methoddesc}{encoded_header_len}{s}
311Return the length of the string \var{s} after header encoding, properly
312calculating for quoted-printable or base64 encoding.
313\end{methoddesc}
314
315\begin{methoddesc}{header_encode}{s\optional{, convert}}
316Header-encode the string \var{s}.
317
318If \var{convert} is \code{True}, the string will be converted from the
319input charset to the output charset automatically. This is not useful
320for multibyte character sets, which have line length issues (multibyte
321characters must be split on a character, not a byte boundary); use the
322higher-level \class{Header} class to deal with these issues (see
323\refmodule{email.Header}). \var{convert} defaults to \code{False}.
324
325The type of encoding (base64 or quoted-printable) will be based on
326the \var{header_encoding} attribute.
327\end{methoddesc}
328
329\begin{methoddesc}{body_encode}{s\optional{, convert}}
330Body-encode the string \var{s}.
331
332If \var{convert} is \code{True} (the default), the string will be
333converted from the input charset to output charset automatically.
334Unlike \method{header_encode()}, there are no issues with byte
335boundaries and multibyte charsets in email bodies, so this is usually
336pretty safe.
337
338The type of encoding (base64 or quoted-printable) will be based on
339the \var{body_encoding} attribute.
340\end{methoddesc}
341
342The \class{Charset} class also provides a number of methods to support
343standard operations and built-in functions.
344
345\begin{methoddesc}[Charset]{__str__}{}
346Returns \var{input_charset} as a string coerced to lower case.
347\end{methoddesc}
348
349\begin{methoddesc}[Charset]{__eq__}{other}
350This method allows you to compare two \class{Charset} instances for equality.
351\end{methoddesc}
352
353\begin{methoddesc}[Charset]{__ne__}{other}
354This method allows you to compare two \class{Charset} instances for inequality.
355\end{methoddesc}
356
357The \module{email.Charset} module also provides the following
358functions for adding new entries to the global character set, alias,
359and codec registries:
360
361\begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
362 body_enc\optional{, output_charset}}}}
363Add character properties to the global registry.
364
365\var{charset} is the input character set, and must be the canonical
366name of a character set.
367
368Optional \var{header_enc} and \var{body_enc} is either
369\code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
370base64 encoding, \code{Charset.SHORTEST} for the shortest of qp or
371base64 encoding, or \code{None} for no encoding. \code{SHORTEST} is
372only valid for \var{header_enc}. It describes how message headers and
373message bodies in the input charset are to be encoded. Default is no
374encoding.
375
376Optional \var{output_charset} is the character set that the output
377should be in. Conversions will proceed from input charset, to
378Unicode, to the output charset when the method
379\method{Charset.convert()} is called. The default is to output in the
380same character set as the input.
381
382Both \var{input_charset} and \var{output_charset} must have Unicode
383codec entries in the module's character set-to-codec mapping; use
384\function{add_codec(charset, codecname)} to add codecs the module does
385not know about. See the \refmodule{codecs} module's documentation for
386more information.
387
388The global character set registry is kept in the module global
389dictionary \code{CHARSETS}.
390\end{funcdesc}
391
392\begin{funcdesc}{add_alias}{alias, canonical}
393Add a character set alias. \var{alias} is the alias name,
394e.g. \code{latin-1}. \var{canonical} is the character set's canonical
395name, e.g. \code{iso-8859-1}.
396
397The global charset alias registry is kept in the module global
398dictionary \code{ALIASES}.
399\end{funcdesc}
400
401\begin{funcdesc}{add_codec}{charset, codecname}
402Add a codec that map characters in the given character set to and from
403Unicode.
404
405\var{charset} is the canonical name of a character set.
406\var{codecname} is the name of a Python codec, as appropriate for the
407second argument to the \function{unicode()} built-in, or to the
408\method{encode()} method of a Unicode string.
409\end{funcdesc}