blob: 6dfa4d38e6dc7dfdccd3469c225914ba50e312fa [file] [log] [blame]
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# Shared string constants; NL is the separator used when the parser joins
# a header's continuation lines back into a single value.
EMPTYSTRING = ''
NL = '\n'
16
Barry Warsawe03e8f02002-09-28 20:44:58 +000017try:
18 True, False
19except NameError:
20 True = 1
21 False = 0
22
# Matches one line ending in any of the three conventions.  '\r\n' is listed
# first so that a CRLF pair is consumed as a single line ending rather than
# as a lone '\r'.
nlcre = re.compile('\r\n|\r|\n')
24
Barry Warsawe03e8f02002-09-28 20:44:58 +000025
Barry Warsawe968ead2001-10-04 17:05:11 +000026
class Parser:
    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the string.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store it on container.

        Lines are consumed from fp up to and including the first empty line,
        or until end of file.  A `From ' line seen as the very first line is
        recorded with container.set_unixfrom().  Every other header is stored
        with container[name] = value; continuation lines are kept verbatim
        and joined to the first line of the value with NL.  Nothing is
        returned.

        Raises Errors.HeaderParseError when a continuation line appears
        before any header, when a line is neither a header nor a
        continuation (except for a leading `--' line in non-strict mode),
        and, in strict mode only, when a `From ' line appears after the
        first line.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                # End of file
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                # The empty line that terminates the header block
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if (not self._strict and lineno == 1
                        and line.startswith('--')):
                    # allow through duplicate boundary tags.
                    continue
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Parse the rest of fp as the body of container.

        What happens depends on what the container's headers report:

        - If container.get_boundary() is set, the remaining text is split on
          the boundary and each piece is parsed as a subpart and attached.
          Text before the first boundary becomes container.preamble and text
          after the terminating boundary becomes container.epilogue.
        - A multipart main type without a boundary raises
          Errors.BoundaryError.
        - message/delivery-status bodies are read as consecutive header
          blocks; the payload is the list of resulting message objects.
        - Any other `message' main type is parsed as a nested message and
          attached.
        - Everything else is stored unparsed via container.set_payload().

        Raises Errors.BoundaryError for a missing start or end boundary in
        strict mode, and, in non-strict mode, when the end boundary is
        missing and the text does not even end with a line ending.
        """
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                if self._strict:
                    raise Errors.BoundaryError(
                        "Couldn't find starting boundary: %s" % boundary)
                # Non-strict: keep the whole body as an unparsed payload.
                container.set_payload(payload)
                return
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[:start]
            # Skip past the boundary, its trailing whitespace and the line
            # ending that follows it, so start points at the first subpart.
            start += len(mo.group('sep')) + len(mo.group('ws'))
            mo = nlcre.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                # The line ending in front of the terminating boundary is
                # taken as the line ending used between all the subparts.
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        raise Errors.BoundaryError(
                            'No terminating boundary and no trailing '
                            'empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # Preamble and epilogue are the same for every subpart, so set
            # them once rather than on each pass through the loop below.
            container.preamble = preamble
            container.epilogue = epilogue
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            for part in parts:
                if isdigest:
                    if part.startswith(linesep):
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[len(linesep):]
                    else:
                        parthdrs, part = part.split(linesep+linesep, 1)
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=True)
                    # while submsgobj is the message itself
                    msgobj.set_default_type('message/rfc822')
                    maintype = msgobj.get_content_maintype()
                    if maintype in ('message', 'multipart'):
                        submsgobj = self.parsestr(part)
                        msgobj.attach(submsgobj)
                    else:
                        msgobj.set_payload(part)
                else:
                    msgobj = self.parsestr(part)
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000264
265
266
class HeaderParser(Parser):
    """A subclass of Parser, this one only meaningfully parses message headers.

    This class can be used if all you're interested in is the headers of a
    message.  While it consumes the message body, it does not parse it, but
    simply makes it available as a string payload.

    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """
    def _parsebody(self, container, fp):
        # Consume, but do not parse, the body: whatever is left in fp is
        # stored on the container as its payload in a single read.
        container.set_payload(fp.read())