blob: 5db0bf38c80cdd65d0695983754cd8a0765e50f1 [file] [log] [blame]
Fred Drake295da241998-08-10 19:42:37 +00001\section{\module{rfc822} ---
Barry Warsaw95400a22001-07-16 20:47:58 +00002 Parse RFC 2822 mail headers}
Guido van Rossuma12ef941995-02-27 17:53:25 +00003
Fred Drakeffbe6871999-04-22 21:23:22 +00004\declaremodule{standard}{rfc822}
Barry Warsaw95400a22001-07-16 20:47:58 +00005\modulesynopsis{Parse \rfc{2822} style mail messages.}
Fred Drakeb91e9341998-07-23 17:59:49 +00006
Barry Warsaw95400a22001-07-16 20:47:58 +00007This module defines a class, \class{Message}, which represents an
8``email message'' as defined by the Internet standard
Fred Drakee78661b2001-07-17 05:17:58 +00009\rfc{2822}.\footnote{This module originally conformed to \rfc{822},
Barry Warsaw95400a22001-07-16 20:47:58 +000010hence the name. Since then, \rfc{2822} has been released as an
11update to \rfc{822}. This module should be considered
12\rfc{2822}-conformant, especially in cases where the
Fred Drakee78661b2001-07-17 05:17:58 +000013syntax or semantics have changed since \rfc{822}.} Such messages
Barry Warsaw95400a22001-07-16 20:47:58 +000014consist of a collection of message headers, and a message body. This
15module also defines a helper class
16\class{AddressList} for parsing \rfc{2822} addresses. Please refer to
17the RFC for information on the specific syntax of \rfc{2822} messages.
Guido van Rossuma12ef941995-02-27 17:53:25 +000018
Fred Drake38e5d272000-04-03 20:13:55 +000019The \refmodule{mailbox}\refstmodindex{mailbox} module provides classes
20to read mailboxes produced by various end-user mail programs.
Guido van Rossum067a2ac1997-06-02 17:30:03 +000021
Fred Drakecdea8a31998-03-14 06:17:43 +000022\begin{classdesc}{Message}{file\optional{, seekable}}
A \class{Message} instance is instantiated with an input object as
parameter. \class{Message} relies only on the input object having a
\method{readline()} method; in particular, ordinary file objects
qualify. Instantiation reads headers from the input object up to a
delimiter line (normally a blank line) and stores them in the
instance. The message body, following the headers, is not consumed.
Guido van Rossum12991001998-06-10 21:34:27 +000029
Fred Drake23329d41998-08-10 17:46:22 +000030This class can work with any input object that supports a
31\method{readline()} method. If the input object has seek and tell
32capability, the \method{rewindbody()} method will work; also, illegal
33lines will be pushed back onto the input stream. If the input object
34lacks seek but has an \method{unread()} method that can push back a
35line of input, \class{Message} will use that to push back illegal
36lines. Thus this class can be used to parse messages coming from a
37buffered stream.
Guido van Rossum12991001998-06-10 21:34:27 +000038
The optional \var{seekable} argument is provided as a workaround for
certain stdio libraries in which \cfunction{tell()} discards buffered
data before discovering that the \cfunction{lseek()} system call
doesn't work. For maximum portability, you should set the
\var{seekable} argument to zero to prevent that initial \method{tell()}
when passing in an unseekable object such as a file object created
from a socket object.
Guido van Rossuma12ef941995-02-27 17:53:25 +000046
47Input lines as read from the file may either be terminated by CR-LF or
48by a single linefeed; a terminating CR-LF is replaced by a single
49linefeed before the line is stored.
50
51All header matching is done independent of upper or lower case;
Fred Drake23329d41998-08-10 17:46:22 +000052e.g.\ \code{\var{m}['From']}, \code{\var{m}['from']} and
Fred Drakecdea8a31998-03-14 06:17:43 +000053\code{\var{m}['FROM']} all yield the same result.
54\end{classdesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +000055
Guido van Rossum87294831998-06-16 22:27:40 +000056\begin{classdesc}{AddressList}{field}
Fred Drakeae0f2921999-06-10 15:03:07 +000057You may instantiate the \class{AddressList} helper class using a single
Barry Warsaw95400a22001-07-16 20:47:58 +000058string parameter, a comma-separated list of \rfc{2822} addresses to be
Fred Drake23329d41998-08-10 17:46:22 +000059parsed. (The parameter \code{None} yields an empty list.)
Guido van Rossum87294831998-06-16 22:27:40 +000060\end{classdesc}
61
Barry Warsaw95400a22001-07-16 20:47:58 +000062\begin{funcdesc}{quote}{str}
63Return a new string with backslashes in \var{str} replaced by two
64backslashes and double quotes replaced by backslash-double quote.
65\end{funcdesc}
66
67\begin{funcdesc}{unquote}{str}
68Return a new string which is an \emph{unquoted} version of \var{str}.
69If \var{str} ends and begins with double quotes, they are stripped
70off. Likewise if \var{str} ends and begins with angle brackets, they
71are stripped off.
72\end{funcdesc}
73
74\begin{funcdesc}{parseaddr}{address}
Fred Drakee78661b2001-07-17 05:17:58 +000075Parse \var{address}, which should be the value of some address-containing
76field such as \code{To:} or \code{Cc:}, into its constituent
Barry Warsaw95400a22001-07-16 20:47:58 +000077``realname'' and ``email address'' parts. Returns a tuple of that
78information, unless the parse fails, in which case a 2-tuple of
79\code{(None, None)} is returned.
80\end{funcdesc}
81
82\begin{funcdesc}{dump_address_pair}{pair}
The inverse of \function{parseaddr()}, this takes a 2-tuple of the form
\code{(realname, email_address)} and returns the string value suitable
for a \code{To:} or \code{Cc:} header. If the first element of
\var{pair} is false, then the second element is returned unmodified.
87\end{funcdesc}
88
Guido van Rossum843e7121996-12-06 21:23:53 +000089\begin{funcdesc}{parsedate}{date}
Attempts to parse a date according to the rules in \rfc{2822}.
However, some mailers don't follow that format as specified, so
\function{parsedate()} tries to guess correctly in such cases.
Barry Warsaw95400a22001-07-16 20:47:58 +000093\var{date} is a string containing an \rfc{2822} date, such as
Fred Drakecdea8a31998-03-14 06:17:43 +000094\code{'Mon, 20 Nov 1995 19:12:08 -0500'}. If it succeeds in parsing
95the date, \function{parsedate()} returns a 9-tuple that can be passed
96directly to \function{time.mktime()}; otherwise \code{None} will be
Fred Drake38e5d272000-04-03 20:13:55 +000097returned. Note that fields 6, 7, and 8 of the result tuple are not
98usable.
Guido van Rossum843e7121996-12-06 21:23:53 +000099\end{funcdesc}
100
101\begin{funcdesc}{parsedate_tz}{date}
Fred Drakecdea8a31998-03-14 06:17:43 +0000102Performs the same function as \function{parsedate()}, but returns
103either \code{None} or a 10-tuple; the first 9 elements make up a tuple
104that can be passed directly to \function{time.mktime()}, and the tenth
105is the offset of the date's timezone from UTC (which is the official
106term for Greenwich Mean Time). (Note that the sign of the timezone
107offset is the opposite of the sign of the \code{time.timezone}
108variable for the same timezone; the latter variable follows the
Barry Warsaw95400a22001-07-16 20:47:58 +0000109\POSIX{} standard while this module follows \rfc{2822}.) If the input
Fred Drakecdea8a31998-03-14 06:17:43 +0000110string has no timezone, the last element of the tuple returned is
Fred Drake38e5d272000-04-03 20:13:55 +0000111\code{None}. Note that fields 6, 7, and 8 of the result tuple are not
112usable.
Guido van Rossum843e7121996-12-06 21:23:53 +0000113\end{funcdesc}
114
Guido van Rossum8cf94e61998-02-18 05:09:14 +0000115\begin{funcdesc}{mktime_tz}{tuple}
Turn a 10-tuple as returned by \function{parsedate_tz()} into a UTC
timestamp. If the timezone item in the tuple is \code{None}, assume
local time. Minor deficiency: this first interprets the first 8
elements as a local time and then compensates for the timezone
difference; this may yield a slight error around daylight saving time
switch dates. Not enough to worry about for common use.
122\end{funcdesc}
123
Fred Drakeea002051999-04-28 18:11:09 +0000124
Fred Drake38e5d272000-04-03 20:13:55 +0000125\begin{seealso}
126 \seemodule{mailbox}{Classes to read various mailbox formats produced
127 by end-user mail programs.}
  \seemodule{mimetools}{Subclass of \class{rfc822.Message} that handles
                        MIME-encoded messages.}
Fred Drake38e5d272000-04-03 20:13:55 +0000130\end{seealso}
131
132
Fred Drakeea002051999-04-28 18:11:09 +0000133\subsection{Message Objects \label{message-objects}}
Guido van Rossumecde7811995-03-28 13:35:14 +0000134
Fred Drakecdea8a31998-03-14 06:17:43 +0000135A \class{Message} instance has the following methods:
Guido van Rossuma12ef941995-02-27 17:53:25 +0000136
Fred Drakee14dde21998-04-04 06:19:30 +0000137\begin{methoddesc}{rewindbody}{}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000138Seek to the start of the message body. This only works if the file
139object is seekable.
Fred Drakee14dde21998-04-04 06:19:30 +0000140\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000141
Guido van Rossum444d0f81998-06-11 13:50:02 +0000142\begin{methoddesc}{isheader}{line}
Returns a line's canonicalized fieldname (the dictionary key that will
be used to index it) if the line is a legal \rfc{2822} header; otherwise
returns \code{None} (implying that parsing should stop here and the
line be pushed back on the input stream). It is sometimes useful to
override this method in a subclass.
148\end{methoddesc}
149
Guido van Rossum12991001998-06-10 21:34:27 +0000150\begin{methoddesc}{islast}{line}
Return true if the given line is a delimiter on which \class{Message}
should stop. The delimiter line is consumed, and the file object's read
location positioned immediately after it. By default this method just
checks that the line is blank, but you can override it in a subclass.
Guido van Rossum12991001998-06-10 21:34:27 +0000155\end{methoddesc}
156
157\begin{methoddesc}{iscomment}{line}
158Return true if the given line should be ignored entirely, just skipped.
159By default this is a stub that always returns false, but you can
160override it in a subclass.
161\end{methoddesc}
162
Fred Drakee14dde21998-04-04 06:19:30 +0000163\begin{methoddesc}{getallmatchingheaders}{name}
Guido van Rossum6c4f0031995-03-07 10:14:09 +0000164Return a list of lines consisting of all headers matching
Guido van Rossuma12ef941995-02-27 17:53:25 +0000165\var{name}, if any. Each physical line, whether it is a continuation
166line or not, is a separate list item. Return the empty list if no
167header matches \var{name}.
Fred Drakee14dde21998-04-04 06:19:30 +0000168\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000169
Fred Drakee14dde21998-04-04 06:19:30 +0000170\begin{methoddesc}{getfirstmatchingheader}{name}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000171Return a list of lines comprising the first header matching
Fred Drakeea002051999-04-28 18:11:09 +0000172\var{name}, and its continuation line(s), if any. Return
173\code{None} if there is no header matching \var{name}.
Fred Drakee14dde21998-04-04 06:19:30 +0000174\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000175
Fred Drakee14dde21998-04-04 06:19:30 +0000176\begin{methoddesc}{getrawheader}{name}
Return a single string consisting of the text after the colon in the
first header matching \var{name}. This includes leading whitespace,
the trailing linefeed, and internal linefeeds and whitespace if
any continuation line(s) were present. Return \code{None} if there is
no header matching \var{name}.
Fred Drakee14dde21998-04-04 06:19:30 +0000182\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000183
Guido van Rossum12991001998-06-10 21:34:27 +0000184\begin{methoddesc}{getheader}{name\optional{, default}}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000185Like \code{getrawheader(\var{name})}, but strip leading and trailing
Guido van Rossum12991001998-06-10 21:34:27 +0000186whitespace. Internal whitespace is not stripped. The optional
187\var{default} argument can be used to specify a different default to
188be returned when there is no header matching \var{name}.
189\end{methoddesc}
190
191\begin{methoddesc}{get}{name\optional{, default}}
Fred Drake23329d41998-08-10 17:46:22 +0000192An alias for \method{getheader()}, to make the interface more compatible
Guido van Rossum12991001998-06-10 21:34:27 +0000193with regular dictionaries.
Fred Drakee14dde21998-04-04 06:19:30 +0000194\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000195
Fred Drakee14dde21998-04-04 06:19:30 +0000196\begin{methoddesc}{getaddr}{name}
Fred Drakecdea8a31998-03-14 06:17:43 +0000197Return a pair \code{(\var{full name}, \var{email address})} parsed
198from the string returned by \code{getheader(\var{name})}. If no
199header matching \var{name} exists, return \code{(None, None)};
200otherwise both the full name and the address are (possibly empty)
201strings.
Guido van Rossuma12ef941995-02-27 17:53:25 +0000202
Fred Drakecdea8a31998-03-14 06:17:43 +0000203Example: If \var{m}'s first \code{From} header contains the string
Guido van Rossum470be141995-03-17 16:07:09 +0000204\code{'jack@cwi.nl (Jack Jansen)'}, then
Guido van Rossuma12ef941995-02-27 17:53:25 +0000205\code{m.getaddr('From')} will yield the pair
Guido van Rossum470be141995-03-17 16:07:09 +0000206\code{('Jack Jansen', 'jack@cwi.nl')}.
Guido van Rossuma12ef941995-02-27 17:53:25 +0000207If the header contained
Guido van Rossum470be141995-03-17 16:07:09 +0000208\code{'Jack Jansen <jack@cwi.nl>'} instead, it would yield the
Guido van Rossuma12ef941995-02-27 17:53:25 +0000209exact same result.
Fred Drakee14dde21998-04-04 06:19:30 +0000210\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000211
Fred Drakee14dde21998-04-04 06:19:30 +0000212\begin{methoddesc}{getaddrlist}{name}
This is similar to \code{getaddr(\var{name})}, but parses a header
Fred Drake23329d41998-08-10 17:46:22 +0000214containing a list of email addresses (e.g.\ a \code{To} header) and
Fred Drakecdea8a31998-03-14 06:17:43 +0000215returns a list of \code{(\var{full name}, \var{email address})} pairs
216(even if there was only one address in the header). If there is no
217header matching \var{name}, return an empty list.
Guido van Rossuma12ef941995-02-27 17:53:25 +0000218
If multiple headers exist that match the named header (e.g.\ if there
are several \code{Cc} headers), all are parsed for addresses. Any
continuation lines the named headers contain are also parsed.
Fred Drakee14dde21998-04-04 06:19:30 +0000222\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000223
Fred Drakee14dde21998-04-04 06:19:30 +0000224\begin{methoddesc}{getdate}{name}
Fred Drakecdea8a31998-03-14 06:17:43 +0000225Retrieve a header using \method{getheader()} and parse it into a 9-tuple
Fred Drake38e5d272000-04-03 20:13:55 +0000226compatible with \function{time.mktime()}; note that fields 6, 7, and 8
227are not usable. If there is no header matching
Guido van Rossuma12ef941995-02-27 17:53:25 +0000228\var{name}, or it is unparsable, return \code{None}.
229
230Date parsing appears to be a black art, and not all mailers adhere to
231the standard. While it has been tested and found correct on a large
232collection of email from many sources, it is still possible that this
233function may occasionally yield an incorrect result.
Fred Drakee14dde21998-04-04 06:19:30 +0000234\end{methoddesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000235
Fred Drakee14dde21998-04-04 06:19:30 +0000236\begin{methoddesc}{getdate_tz}{name}
Fred Drakecdea8a31998-03-14 06:17:43 +0000237Retrieve a header using \method{getheader()} and parse it into a
23810-tuple; the first 9 elements will make a tuple compatible with
239\function{time.mktime()}, and the 10th is a number giving the offset
Fred Drake38e5d272000-04-03 20:13:55 +0000240of the date's timezone from UTC. Note that fields 6, 7, and 8
241are not usable. Similarly to \method{getdate()}, if
Guido van Rossum843e7121996-12-06 21:23:53 +0000242there is no header matching \var{name}, or it is unparsable, return
243\code{None}.
Fred Drakee14dde21998-04-04 06:19:30 +0000244\end{methoddesc}
Guido van Rossum843e7121996-12-06 21:23:53 +0000245
Fred Drake70631492001-05-22 14:36:30 +0000246\class{Message} instances also support a limited mapping interface.
Fred Drakee14dde21998-04-04 06:19:30 +0000247In particular: \code{\var{m}[name]} is like
248\code{\var{m}.getheader(name)} but raises \exception{KeyError} if
249there is no matching header; and \code{len(\var{m})},
\code{\var{m}.get(name\optional{, default})},
\code{\var{m}.has_key(name)}, \code{\var{m}.keys()},
\code{\var{m}.values()}, \code{\var{m}.items()}, and
Fred Drake98cfab62001-05-22 22:00:40 +0000253\code{\var{m}.setdefault(name\optional{, default})} act as expected,
254with the one difference that \method{get()} and \method{setdefault()}
255use an empty string as the default value. \class{Message} instances
256also support the mapping writable interface \code{\var{m}[name] =
257value} and \code{del \var{m}[name]}. \class{Message} objects do not
258support the \method{clear()}, \method{copy()}, \method{popitem()}, or
Fred Drake6b4593e2001-05-22 15:12:46 +0000259\method{update()} methods of the mapping interface. (Support for
Fred Drakee78661b2001-07-17 05:17:58 +0000260\method{get()} and \method{setdefault()} was only added in Python
Fred Drake6b4593e2001-05-22 15:12:46 +00002612.2.)
Guido van Rossuma12ef941995-02-27 17:53:25 +0000262
Fred Drakecdea8a31998-03-14 06:17:43 +0000263Finally, \class{Message} instances have two public instance variables:
Guido van Rossuma12ef941995-02-27 17:53:25 +0000264
Fred Drakee14dde21998-04-04 06:19:30 +0000265\begin{memberdesc}{headers}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000266A list containing the entire set of header lines, in the order in
Guido van Rossum87294831998-06-16 22:27:40 +0000267which they were read (except that setitem calls may disturb this
268order). Each line contains a trailing newline. The
Guido van Rossuma12ef941995-02-27 17:53:25 +0000269blank line terminating the headers is not contained in the list.
Fred Drakee14dde21998-04-04 06:19:30 +0000270\end{memberdesc}
Guido van Rossuma12ef941995-02-27 17:53:25 +0000271
Fred Drakee14dde21998-04-04 06:19:30 +0000272\begin{memberdesc}{fp}
Fred Drakeea002051999-04-28 18:11:09 +0000273The file or file-like object passed at instantiation time. This can
274be used to read the message content.
Fred Drakee14dde21998-04-04 06:19:30 +0000275\end{memberdesc}
Guido van Rossum87294831998-06-16 22:27:40 +0000276
Fred Drakeea002051999-04-28 18:11:09 +0000277
278\subsection{AddressList Objects \label{addresslist-objects}}
Guido van Rossum87294831998-06-16 22:27:40 +0000279
280An \class{AddressList} instance has the following methods:
281
Fred Drake9c846362001-04-09 15:42:56 +0000282\begin{methoddesc}{__len__}{}
Guido van Rossum87294831998-06-16 22:27:40 +0000283Return the number of addresses in the address list.
284\end{methoddesc}
285
Fred Drake9c846362001-04-09 15:42:56 +0000286\begin{methoddesc}{__str__}{}
Return a canonicalized string representation of the address list.
Addresses are rendered in \code{"name" <host@domain>} form, comma-separated.
289\end{methoddesc}
290
Fred Drake9c846362001-04-09 15:42:56 +0000291\begin{methoddesc}{__add__}{alist}
292Return a new \class{AddressList} instance that contains all addresses
293in both \class{AddressList} operands, with duplicates removed (set
294union).
Guido van Rossum87294831998-06-16 22:27:40 +0000295\end{methoddesc}
296
Fred Drake9c846362001-04-09 15:42:56 +0000297\begin{methoddesc}{__iadd__}{alist}
298In-place version of \method{__add__()}; turns this \class{AddressList}
299instance into the union of itself and the right-hand instance,
300\var{alist}.
301\end{methoddesc}
302
303\begin{methoddesc}{__sub__}{alist}
304Return a new \class{AddressList} instance that contains every address
305in the left-hand \class{AddressList} operand that is not present in
306the right-hand address operand (set difference).
307\end{methoddesc}
308
309\begin{methoddesc}{__isub__}{alist}
310In-place version of \method{__sub__()}, removing addresses in this
311list which are also in \var{alist}.
Guido van Rossum87294831998-06-16 22:27:40 +0000312\end{methoddesc}
313
314
315Finally, \class{AddressList} instances have one public instance variable:
316
317\begin{memberdesc}{addresslist}
A list of 2-tuples of strings, one per address. In each tuple, the
first item is the canonicalized name part and the second is the
actual route-address (\character{@}-separated username-host.domain
pair).
Guido van Rossum87294831998-06-16 22:27:40 +0000322\end{memberdesc}