blob: 2fb7d3e23e4e18e5c8a40340f2bb3363b53d991e [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{rfc822} ---
Barry Warsaw95400a22001-07-16 20:47:58 +00002 Parse RFC 2822 mail headers}
Guido van Rossuma12ef941995-02-27 17:53:25 +00003
Fred Drakeffbe6871999-04-22 21:23:22 +00004\declaremodule{standard}{rfc822}
Barry Warsaw95400a22001-07-16 20:47:58 +00005\modulesynopsis{Parse \rfc{2822} style mail messages.}
Fred Drakeb91e9341998-07-23 17:59:49 +00006
Barry Warsaw95400a22001-07-16 20:47:58 +00007This module defines a class, \class{Message}, which represents an
8``email message'' as defined by the Internet standard
Fred Drakee78661b2001-07-17 05:17:58 +00009\rfc{2822}.\footnote{This module originally conformed to \rfc{822},
Barry Warsaw95400a22001-07-16 20:47:58 +000010hence the name. Since then, \rfc{2822} has been released as an
11update to \rfc{822}. This module should be considered
12\rfc{2822}-conformant, especially in cases where the
Fred Drakee78661b2001-07-17 05:17:58 +000013syntax or semantics have changed since \rfc{822}.} Such messages
Barry Warsaw95400a22001-07-16 20:47:58 +000014consist of a collection of message headers, and a message body. This
15module also defines a helper class
16\class{AddressList} for parsing \rfc{2822} addresses. Please refer to
17the RFC for information on the specific syntax of \rfc{2822} messages.
Guido van Rossuma12ef941995-02-27 17:53:25 +000018
Fred Drake38e5d272000-04-03 20:13:55 +000019The \refmodule{mailbox}\refstmodindex{mailbox} module provides classes
20to read mailboxes produced by various end-user mail programs.
Guido van Rossum067a2ac1997-06-02 17:30:03 +000021
Fred Drakecdea8a31998-03-14 06:17:43 +000022\begin{classdesc}{Message}{file\optional{, seekable}}
A \class{Message} instance is instantiated with an input object as
parameter.  \class{Message} relies only on the input object having a
\method{readline()} method; in particular, ordinary file objects
qualify.  Instantiation reads headers from the input object up to a
delimiter line (normally a blank line) and stores them in the
instance.  The message body, following the headers, is not consumed.
Guido van Rossum12991001998-06-10 21:34:27 +000029
Fred Drake23329d41998-08-10 17:46:22 +000030This class can work with any input object that supports a
31\method{readline()} method. If the input object has seek and tell
32capability, the \method{rewindbody()} method will work; also, illegal
33lines will be pushed back onto the input stream. If the input object
34lacks seek but has an \method{unread()} method that can push back a
35line of input, \class{Message} will use that to push back illegal
36lines. Thus this class can be used to parse messages coming from a
37buffered stream.
Guido van Rossum12991001998-06-10 21:34:27 +000038
The optional \var{seekable} argument is provided as a workaround for
certain stdio libraries in which \cfunction{tell()} discards buffered
data before discovering that the \cfunction{lseek()} system call
doesn't work.  For maximum portability, you should set the
\var{seekable} argument to zero to prevent that initial \method{tell()}
when passing in an unseekable object such as a file object created from
a socket object.
Guido van Rossuma12ef941995-02-27 17:53:25 +000046
47Input lines as read from the file may either be terminated by CR-LF or
48by a single linefeed; a terminating CR-LF is replaced by a single
49linefeed before the line is stored.
50
51All header matching is done independent of upper or lower case;
Fred Drake23329d41998-08-10 17:46:22 +000052e.g.\ \code{\var{m}['From']}, \code{\var{m}['from']} and
Fred Drakecdea8a31998-03-14 06:17:43 +000053\code{\var{m}['FROM']} all yield the same result.
54\end{classdesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +000055
Guido van Rossum87294831998-06-16 22:27:40 +000056\begin{classdesc}{AddressList}{field}
Fred Drakeae0f2921999-06-10 15:03:07 +000057You may instantiate the \class{AddressList} helper class using a single
Barry Warsaw95400a22001-07-16 20:47:58 +000058string parameter, a comma-separated list of \rfc{2822} addresses to be
Fred Drake23329d41998-08-10 17:46:22 +000059parsed. (The parameter \code{None} yields an empty list.)
Guido van Rossum87294831998-06-16 22:27:40 +000060\end{classdesc}
61
Barry Warsaw95400a22001-07-16 20:47:58 +000062\begin{funcdesc}{quote}{str}
63Return a new string with backslashes in \var{str} replaced by two
64backslashes and double quotes replaced by backslash-double quote.
65\end{funcdesc}
66
67\begin{funcdesc}{unquote}{str}
68Return a new string which is an \emph{unquoted} version of \var{str}.
69If \var{str} ends and begins with double quotes, they are stripped
70off. Likewise if \var{str} ends and begins with angle brackets, they
71are stripped off.
72\end{funcdesc}
73
74\begin{funcdesc}{parseaddr}{address}
Fred Draked86038d2001-08-03 18:39:36 +000075Parse \var{address}, which should be the value of some
76address-containing field such as \mailheader{To} or \mailheader{Cc},
77into its constituent ``realname'' and ``email address'' parts.
78Returns a tuple of that information, unless the parse fails, in which
79case a 2-tuple \code{(None, None)} is returned.
Barry Warsaw95400a22001-07-16 20:47:58 +000080\end{funcdesc}
81
82\begin{funcdesc}{dump_address_pair}{pair}
The inverse of \function{parseaddr()}, this takes a 2-tuple of the form
\code{(\var{realname}, \var{email_address})} and returns the string
value suitable for a \mailheader{To} or \mailheader{Cc} header.  If
the first element of \var{pair} is false, then the second element is
returned unmodified.
Barry Warsaw95400a22001-07-16 20:47:58 +000088\end{funcdesc}
89
Guido van Rossum843e7121996-12-06 21:23:53 +000090\begin{funcdesc}{parsedate}{date}
Attempts to parse a date according to the rules in \rfc{2822}.
However, some mailers don't follow that format as specified, so
\function{parsedate()} tries to guess correctly in such cases.
Barry Warsaw95400a22001-07-16 20:47:58 +000094\var{date} is a string containing an \rfc{2822} date, such as
Fred Drakecdea8a31998-03-14 06:17:43 +000095\code{'Mon, 20 Nov 1995 19:12:08 -0500'}. If it succeeds in parsing
96the date, \function{parsedate()} returns a 9-tuple that can be passed
97directly to \function{time.mktime()}; otherwise \code{None} will be
Fred Drake38e5d272000-04-03 20:13:55 +000098returned. Note that fields 6, 7, and 8 of the result tuple are not
99usable.
Guido van Rossum843e7121996-12-06 21:23:53 +0000100\end{funcdesc}
101
102\begin{funcdesc}{parsedate_tz}{date}
Fred Drakecdea8a31998-03-14 06:17:43 +0000103Performs the same function as \function{parsedate()}, but returns
104either \code{None} or a 10-tuple; the first 9 elements make up a tuple
105that can be passed directly to \function{time.mktime()}, and the tenth
106is the offset of the date's timezone from UTC (which is the official
107term for Greenwich Mean Time). (Note that the sign of the timezone
108offset is the opposite of the sign of the \code{time.timezone}
109variable for the same timezone; the latter variable follows the
Barry Warsaw95400a22001-07-16 20:47:58 +0000110\POSIX{} standard while this module follows \rfc{2822}.) If the input
Fred Drakecdea8a31998-03-14 06:17:43 +0000111string has no timezone, the last element of the tuple returned is
Fred Drake38e5d272000-04-03 20:13:55 +0000112\code{None}. Note that fields 6, 7, and 8 of the result tuple are not
113usable.
Guido van Rossum843e7121996-12-06 21:23:53 +0000114\end{funcdesc}
115
Guido van Rossum8cf94e61998-02-18 05:09:14 +0000116\begin{funcdesc}{mktime_tz}{tuple}
Turn a 10-tuple as returned by \function{parsedate_tz()} into a UTC
timestamp.  If the timezone item in the tuple is \code{None}, assume
local time.  Minor deficiency: this first interprets the first 8
elements as a local time and then compensates for the timezone
difference; this may yield a slight error around daylight saving time
switch dates.  The error is not enough to worry about for common use.
123\end{funcdesc}
124
Fred Drakeea002051999-04-28 18:11:09 +0000125
\begin{seealso}
  \seemodule{mailbox}{Classes to read various mailbox formats produced
                      by end-user mail programs.}
  \seemodule{mimetools}{Subclass of \class{rfc822.Message} that handles
                        MIME-encoded messages.}
\end{seealso}
132
133
Fred Drakeea002051999-04-28 18:11:09 +0000134\subsection{Message Objects \label{message-objects}}
Guido van Rossumecde7811995-03-28 13:35:14 +0000135
Fred Drakecdea8a31998-03-14 06:17:43 +0000136A \class{Message} instance has the following methods:
Guido van Rossuma12ef941995-02-27 17:53:25 +0000137
Fred Drakee14dde21998-04-04 06:19:30 +0000138\begin{methoddesc}{rewindbody}{}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000139Seek to the start of the message body. This only works if the file
140object is seekable.
Fred Drakee14dde21998-04-04 06:19:30 +0000141\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000142
Guido van Rossum444d0f81998-06-11 13:50:02 +0000143\begin{methoddesc}{isheader}{line}
144Returns a line's canonicalized fieldname (the dictionary key that will
Barry Warsaw95400a22001-07-16 20:47:58 +0000145be used to index it) if the line is a legal \rfc{2822} header; otherwise
Fred Draked86038d2001-08-03 18:39:36 +0000146returns \code{None} (implying that parsing should stop here and the
147line be pushed back on the input stream). It is sometimes useful to
148override this method in a subclass.
Guido van Rossum444d0f81998-06-11 13:50:02 +0000149\end{methoddesc}
150
Guido van Rossum12991001998-06-10 21:34:27 +0000151\begin{methoddesc}{islast}{line}
Return true if the given line is a delimiter on which \class{Message}
should stop.  The delimiter line is consumed, and the file object's read
location positioned immediately after it.  By default this method just
checks that the line is blank, but you can override it in a subclass.
Guido van Rossum12991001998-06-10 21:34:27 +0000156\end{methoddesc}
157
158\begin{methoddesc}{iscomment}{line}
159Return true if the given line should be ignored entirely, just skipped.
160By default this is a stub that always returns false, but you can
161override it in a subclass.
162\end{methoddesc}
163
Fred Drakee14dde21998-04-04 06:19:30 +0000164\begin{methoddesc}{getallmatchingheaders}{name}
Guido van Rossum6c4f0031995-03-07 10:14:09 +0000165Return a list of lines consisting of all headers matching
Guido van Rossuma12ef941995-02-27 17:53:25 +0000166\var{name}, if any. Each physical line, whether it is a continuation
167line or not, is a separate list item. Return the empty list if no
168header matches \var{name}.
Fred Drakee14dde21998-04-04 06:19:30 +0000169\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000170
Fred Drakee14dde21998-04-04 06:19:30 +0000171\begin{methoddesc}{getfirstmatchingheader}{name}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000172Return a list of lines comprising the first header matching
Fred Drakeea002051999-04-28 18:11:09 +0000173\var{name}, and its continuation line(s), if any. Return
174\code{None} if there is no header matching \var{name}.
Fred Drakee14dde21998-04-04 06:19:30 +0000175\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000176
Fred Drakee14dde21998-04-04 06:19:30 +0000177\begin{methoddesc}{getrawheader}{name}
Return a single string consisting of the text after the colon in the
first header matching \var{name}.  This includes leading whitespace,
the trailing linefeed, and internal linefeeds and whitespace if
any continuation line(s) were present.  Return \code{None} if there is
no header matching \var{name}.
Fred Drakee14dde21998-04-04 06:19:30 +0000183\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000184
Guido van Rossum12991001998-06-10 21:34:27 +0000185\begin{methoddesc}{getheader}{name\optional{, default}}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000186Like \code{getrawheader(\var{name})}, but strip leading and trailing
Guido van Rossum12991001998-06-10 21:34:27 +0000187whitespace. Internal whitespace is not stripped. The optional
188\var{default} argument can be used to specify a different default to
189be returned when there is no header matching \var{name}.
190\end{methoddesc}
191
192\begin{methoddesc}{get}{name\optional{, default}}
Fred Drake23329d41998-08-10 17:46:22 +0000193An alias for \method{getheader()}, to make the interface more compatible
Guido van Rossum12991001998-06-10 21:34:27 +0000194with regular dictionaries.
Fred Drakee14dde21998-04-04 06:19:30 +0000195\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000196
Fred Drakee14dde21998-04-04 06:19:30 +0000197\begin{methoddesc}{getaddr}{name}
Fred Drakecdea8a31998-03-14 06:17:43 +0000198Return a pair \code{(\var{full name}, \var{email address})} parsed
199from the string returned by \code{getheader(\var{name})}. If no
200header matching \var{name} exists, return \code{(None, None)};
201otherwise both the full name and the address are (possibly empty)
202strings.
Guido van Rossuma12ef941995-02-27 17:53:25 +0000203
Fred Draked86038d2001-08-03 18:39:36 +0000204Example: If \var{m}'s first \mailheader{From} header contains the
205string \code{'jack@cwi.nl (Jack Jansen)'}, then
Guido van Rossuma12ef941995-02-27 17:53:25 +0000206\code{m.getaddr('From')} will yield the pair
Guido van Rossum470be141995-03-17 16:07:09 +0000207\code{('Jack Jansen', 'jack@cwi.nl')}.
Guido van Rossuma12ef941995-02-27 17:53:25 +0000208If the header contained
Guido van Rossum470be141995-03-17 16:07:09 +0000209\code{'Jack Jansen <jack@cwi.nl>'} instead, it would yield the
Guido van Rossuma12ef941995-02-27 17:53:25 +0000210exact same result.
Fred Drakee14dde21998-04-04 06:19:30 +0000211\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000212
Fred Drakee14dde21998-04-04 06:19:30 +0000213\begin{methoddesc}{getaddrlist}{name}
This is similar to \code{getaddr(\var{name})}, but parses a header
containing a list of email addresses (e.g.\ a \mailheader{To} header) and
returns a list of \code{(\var{full name}, \var{email address})} pairs
(even if there was only one address in the header).  If there is no
header matching \var{name}, return an empty list.
Guido van Rossuma12ef941995-02-27 17:53:25 +0000219
If multiple headers exist that match the named header (e.g.\ if there
are several \mailheader{Cc} headers), all are parsed for addresses.
Any continuation lines the named headers contain are also parsed.
Fred Drakee14dde21998-04-04 06:19:30 +0000223\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000224
Fred Drakee14dde21998-04-04 06:19:30 +0000225\begin{methoddesc}{getdate}{name}
Fred Drakecdea8a31998-03-14 06:17:43 +0000226Retrieve a header using \method{getheader()} and parse it into a 9-tuple
Fred Drake38e5d272000-04-03 20:13:55 +0000227compatible with \function{time.mktime()}; note that fields 6, 7, and 8
228are not usable. If there is no header matching
Guido van Rossuma12ef941995-02-27 17:53:25 +0000229\var{name}, or it is unparsable, return \code{None}.
230
231Date parsing appears to be a black art, and not all mailers adhere to
232the standard. While it has been tested and found correct on a large
233collection of email from many sources, it is still possible that this
234function may occasionally yield an incorrect result.
Fred Drakee14dde21998-04-04 06:19:30 +0000235\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000236
Fred Drakee14dde21998-04-04 06:19:30 +0000237\begin{methoddesc}{getdate_tz}{name}
Fred Drakecdea8a31998-03-14 06:17:43 +0000238Retrieve a header using \method{getheader()} and parse it into a
23910-tuple; the first 9 elements will make a tuple compatible with
240\function{time.mktime()}, and the 10th is a number giving the offset
Fred Drake38e5d272000-04-03 20:13:55 +0000241of the date's timezone from UTC. Note that fields 6, 7, and 8
242are not usable. Similarly to \method{getdate()}, if
Guido van Rossum843e7121996-12-06 21:23:53 +0000243there is no header matching \var{name}, or it is unparsable, return
244\code{None}.
Fred Drakee14dde21998-04-04 06:19:30 +0000245\end{methoddesc}
Guido van Rossum843e7121996-12-06 21:23:53 +0000246
\class{Message} instances also support a limited mapping interface.
In particular: \code{\var{m}[name]} is like
\code{\var{m}.getheader(name)} but raises \exception{KeyError} if
there is no matching header; and \code{len(\var{m})},
\code{\var{m}.get(name\optional{, default})},
\code{\var{m}.has_key(name)}, \code{\var{m}.keys()},
\code{\var{m}.values()}, \code{\var{m}.items()}, and
\code{\var{m}.setdefault(name\optional{, default})} act as expected,
with the one difference that \method{get()} and \method{setdefault()}
use an empty string as the default value.  \class{Message} instances
also support the writable mapping interface \code{\var{m}[name] =
value} and \code{del \var{m}[name]}.  \class{Message} objects do not
support the \method{clear()}, \method{copy()}, \method{popitem()}, or
\method{update()} methods of the mapping interface.  (Support for
\method{get()} and \method{setdefault()} was only added in Python
2.2.)
Guido van Rossuma12ef941995-02-27 17:53:25 +0000263
Fred Drakef5072b92001-09-06 15:07:55 +0000264Finally, \class{Message} instances have some public instance variables:
Guido van Rossuma12ef941995-02-27 17:53:25 +0000265
Fred Drakee14dde21998-04-04 06:19:30 +0000266\begin{memberdesc}{headers}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000267A list containing the entire set of header lines, in the order in
Guido van Rossum87294831998-06-16 22:27:40 +0000268which they were read (except that setitem calls may disturb this
269order). Each line contains a trailing newline. The
Guido van Rossuma12ef941995-02-27 17:53:25 +0000270blank line terminating the headers is not contained in the list.
Fred Drakee14dde21998-04-04 06:19:30 +0000271\end{memberdesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000272
Fred Drakee14dde21998-04-04 06:19:30 +0000273\begin{memberdesc}{fp}
Fred Drakeea002051999-04-28 18:11:09 +0000274The file or file-like object passed at instantiation time. This can
275be used to read the message content.
Fred Drakee14dde21998-04-04 06:19:30 +0000276\end{memberdesc}
Guido van Rossum87294831998-06-16 22:27:40 +0000277
Fred Drakef5072b92001-09-06 15:07:55 +0000278\begin{memberdesc}{unixfrom}
279The \UNIX{} \samp{From~} line, if the message had one, or an empty
280string. This is needed to regenerate the message in some contexts,
281such as an \code{mbox}-style mailbox file.
282\end{memberdesc}
283
Fred Drakeea002051999-04-28 18:11:09 +0000284
285\subsection{AddressList Objects \label{addresslist-objects}}
Guido van Rossum87294831998-06-16 22:27:40 +0000286
287An \class{AddressList} instance has the following methods:
288
Fred Drake9c846362001-04-09 15:42:56 +0000289\begin{methoddesc}{__len__}{}
Guido van Rossum87294831998-06-16 22:27:40 +0000290Return the number of addresses in the address list.
291\end{methoddesc}
292
Fred Drake9c846362001-04-09 15:42:56 +0000293\begin{methoddesc}{__str__}{}
Return a canonicalized string representation of the address list.
Addresses are rendered in \code{"name" <host@domain>} form,
comma-separated.
296\end{methoddesc}
297
Fred Drake9c846362001-04-09 15:42:56 +0000298\begin{methoddesc}{__add__}{alist}
299Return a new \class{AddressList} instance that contains all addresses
300in both \class{AddressList} operands, with duplicates removed (set
301union).
Guido van Rossum87294831998-06-16 22:27:40 +0000302\end{methoddesc}
303
Fred Drake9c846362001-04-09 15:42:56 +0000304\begin{methoddesc}{__iadd__}{alist}
305In-place version of \method{__add__()}; turns this \class{AddressList}
306instance into the union of itself and the right-hand instance,
307\var{alist}.
308\end{methoddesc}
309
310\begin{methoddesc}{__sub__}{alist}
311Return a new \class{AddressList} instance that contains every address
312in the left-hand \class{AddressList} operand that is not present in
313the right-hand address operand (set difference).
314\end{methoddesc}
315
316\begin{methoddesc}{__isub__}{alist}
317In-place version of \method{__sub__()}, removing addresses in this
318list which are also in \var{alist}.
Guido van Rossum87294831998-06-16 22:27:40 +0000319\end{methoddesc}
320
321
322Finally, \class{AddressList} instances have one public instance variable:
323
324\begin{memberdesc}{addresslist}
A list of 2-tuples of strings, one per address.  In each tuple, the
first element is the canonicalized name part and the second is the
actual route-address (\character{@}-separated username-host.domain
pair).
Guido van Rossum87294831998-06-16 22:27:40 +0000329\end{memberdesc}