blob: 09fac4552f9379f8a32d57dd74329527c0478037 [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# Shared string constants.  NL is the separator used when a header value that
# spans several physical lines is joined back into one string.  EMPTYSTRING is
# not referenced by the code visible in this module section.
EMPTYSTRING = ''
NL = '\n'

# Compatibility shim: if evaluating the names True and False raises
# NameError, define them as the integers 1 and 0.
try:
    True, False
except NameError:
    True = 1
    False = 0

# Matches a single line ending in any of the three common conventions.  The
# two-character '\r\n' alternative is listed first so that a CRLF pair is
# consumed as one line ending rather than as a lone '\r'.
NLCRE = re.compile('\r\n|\r|\n')
Barry Warsaw487fe6a2002-10-07 17:27:35 +000024
Barry Warsawe03e8f02002-09-28 20:44:58 +000025
Barry Warsawe968ead2001-10-04 17:05:11 +000026
class Parser:
    """Parser that builds a tree of message objects from RFC 2822 / MIME text.

    Use parse() for file-like objects and parsestr() for strings.  The
    private _parseheaders() and _parsebody() methods do the actual work;
    subclasses may override _parsebody() to change how bodies are handled.
    """

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        # In non-strict mode the header parser may already have consumed the
        # first line of the body; it hands that line back so that it can be
        # glued onto the front of whatever is still unread in fp.
        firstbodyline = self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp, firstbodyline)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the string.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store the headers on container.

        Lines are read until end of file or the first empty line.  A leading
        `From ' line is stored via container.set_unixfrom(); every other
        header is stored with container[name] = value, where continuation
        lines are joined to the first line of the value with NL.

        Returns None normally.  In non-strict mode, when a line is met that
        is neither a header nor a continuation line, header parsing stops and
        that line (without its line ending) is returned so the caller can
        treat it as the first line of the body.

        Raises Errors.HeaderParseError for a continuation line that precedes
        any header and, in strict mode only, for a misplaced `From ' line or
        a line that is not a header.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                # End of input
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                # Blank line: end of the header block
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
                    firstbodyline = line
                    break
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return firstbodyline

    def _parsebody(self, container, fp, firstbodyline=None):
        """Parse the rest of fp as the body of container.

        The body is handled according to what container reports about
        itself:

        - if container.get_boundary() is true, the body is split on that
          boundary and each piece is parsed recursively and attached;
        - a multipart main type without a boundary raises BoundaryError;
        - message/delivery-status is parsed as a sequence of header blocks,
          each stored as its own message object;
        - any other message/* type is parsed as one nested message;
        - everything else is stored as a plain string payload.

        firstbodyline, when not None, is a body line already consumed by
        _parseheaders(); it is put back in front of the remaining text in
        the boundary branch and in the plain string branch.

        Raises Errors.BoundaryError when a multipart message has no boundary
        parameter, when (strict mode) the starting or terminating boundary is
        missing, or when (non-strict mode) there is no terminating boundary
        and the body does not even end with a line ending.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            if firstbodyline is not None:
                payload = firstbodyline + '\n' + payload
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                if self._strict:
                    raise Errors.BoundaryError(
                        "Couldn't find starting boundary: %s" % boundary)
                # Non-strict: keep the raw text as the payload
                container.set_payload(payload)
                return
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Step over the boundary itself, its trailing whitespace and the
            # line ending that follows, whichever convention is in use.
            start += len(mo.group('sep')) + len(mo.group('ws'))
            mo = NLCRE.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                # Remember which line ending this message uses
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        # Without any line ending there is nothing to infer
                        # linesep from, so this is an error even though we
                        # are in non-strict mode.
                        raise Errors.BoundaryError(
                            'No terminating boundary and no trailing empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            # These do not depend on the individual parts, so set them once.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                if isdigest:
                    if part.startswith(linesep):
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[len(linesep):]
                    else:
                        # RFC 2046 permits a body part made of a header block
                        # only, with no blank line and no content.  Treat a
                        # missing blank line as "headers with an empty body"
                        # rather than failing with a ValueError on unpacking.
                        pieces = part.split(linesep + linesep, 1)
                        parthdrs = pieces[0]
                        if len(pieces) == 2:
                            part = pieces[1]
                        else:
                            part = ''
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=True)
                    # while submsgobj is the message itself
                    msgobj.set_default_type('message/rfc822')
                    maintype = msgobj.get_content_maintype()
                    if maintype in ('message', 'multipart'):
                        submsgobj = self.parsestr(part)
                        msgobj.attach(submsgobj)
                    else:
                        msgobj.set_payload(part)
                else:
                    msgobj = self.parsestr(part)
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            # NOTE(review): firstbodyline is not consulted in this branch, so
            # a line handed over by _parseheaders() is dropped here -- confirm
            # whether that is intended.
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            # NOTE(review): firstbodyline is not consulted in this branch
            # either -- confirm whether that is intended.
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            # Anything else is kept as an unparsed string payload
            text = fp.read()
            if firstbodyline is not None:
                text = firstbodyline + '\n' + text
            container.set_payload(text)
Barry Warsawe5528822001-10-11 15:43:00 +0000274
275
276
class HeaderParser(Parser):
    """Parser variant that leaves the message body unparsed.

    Use this class when only the headers of a message matter.  The body is
    still read from the input, but instead of being analysed it is stored
    verbatim as the string payload of the message.

    Because no body parsing takes place, this can be considerably faster
    than the full Parser when the headers are all you need.
    """
    def _parsebody(self, container, fp, firstbodyline=None):
        # Read whatever is left of the input without interpreting it.
        remainder = fp.read()
        if firstbodyline is None:
            payload = remainder
        else:
            # Put back the body line that the header parser already consumed.
            payload = firstbodyline + '\n' + remainder
        container.set_payload(payload)