blob: 98d20c3bdc471578af23ffd0f52a780109d9db38 [file] [log] [blame]
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# String constants shared by the parsing code below.
EMPTYSTRING, NL = '', '\n'

# Interpreters that predate the boolean builtins raise NameError on the
# lookup below; give this module integer stand-ins in that case.
try:
    True, False
except NameError:
    globals().update({'True': 1, 'False': 0})
22
23
Barry Warsawe968ead2001-10-04 17:05:11 +000024
class Parser:
    """Build a tree of message objects from RFC 2822 / MIME message text."""

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Parse a message read from the file-like object fp.

        fp must provide readline() and read().  A new _class instance is
        created, filled with the parsed headers and, unless headersonly is
        true, with the parsed body.  Returns that root message object.
        """
        root = self._class()
        self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp)
        return root

    def parsestr(self, text, headersonly=False):
        """Parse a message held in the string text; see parse()."""
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store each header on container.

        Reading stops at end of input or at the first empty line, which is
        consumed.  A `From ' line seen first is stored through
        container.set_unixfrom().  Raises Errors.HeaderParseError for a
        continuation line before any header, for a line that is neither a
        header nor a continuation, and (strict mode only) for a `From ' line
        that is not the first line.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                # Otherwise ignore the weirdly placed From_ line.
                # XXX: maybe set unixfrom anyway? or only if not already?
                continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if (not self._strict and lineno == 1
                        and line.startswith('--')):
                    # allow through duplicate boundary tags.
                    continue
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''"%line)
            # A new header starts here, so the previous one is complete.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Parse the rest of fp as the body of container.

        The handling is chosen from the container's content type: a message
        with a boundary is split into subparts, message/delivery-status
        becomes a list of header-block messages, any other message/* type
        gets one nested message attached, and everything else is stored as a
        string payload.  Raises Errors.BoundaryError for a multipart message
        that defines no boundary.
        """
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            self._parsemultipart(container, fp, boundary, isdigest)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())

    def _parsemultipart(self, container, fp, boundary, isdigest):
        """Split the rest of fp on boundary and attach each parsed subpart.

        Sets container.preamble and container.epilogue to the text before
        the first and after the closing boundary (None when absent).  When
        isdigest is true each subpart is wrapped in a message/rfc822
        container.  Raises Errors.BoundaryError when the starting boundary
        (strict mode) or the terminating boundary cannot be found.
        """
        preamble = epilogue = None
        # Split into subparts.  The first boundary we're looking for won't
        # always have a leading newline since we're at the start of the
        # body text, and there's not always a preamble before the first
        # boundary.
        separator = '--' + boundary
        payload = fp.read()
        # We use an RE here because boundaries can have trailing
        # whitespace.
        mo = re.search(
            r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
            payload)
        if not mo:
            if self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            container.set_payload(payload)
            return
        start = mo.start()
        if start > 0:
            # there's some pre-MIME boundary preamble
            preamble = payload[0:start]
        # Step over the boundary, its trailing whitespace and one line ending
        start += len(mo.group('sep')) + len(mo.group('ws'))
        cre = re.compile('\r\n|\r|\n')
        mo = cre.search(payload, start)
        if mo:
            start += len(mo.group(0))
        # We create a compiled regexp first because we need to be able to
        # specify the start position, and the module function doesn't
        # support this signature. :(
        cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                         re.escape(separator) + '--')
        mo = cre.search(payload, start)
        if mo:
            terminator = mo.start()
            # The line ending in front of the closing boundary tells us what
            # kind of line endings the message uses.
            linesep = mo.group('sep')
            if mo.end() < len(payload):
                # There's some post-MIME boundary epilogue
                epilogue = payload[mo.end():]
        elif self._strict:
            raise Errors.BoundaryError(
                "Couldn't find terminating boundary: %s" % boundary)
        else:
            # Handle the case of no trailing boundary.  Check that it ends
            # in a blank line.  Some cases (spamspamspam) don't even have
            # that!
            mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
            if not mo:
                mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                if not mo:
                    raise Errors.BoundaryError(
                        'No terminating boundary and no trailing empty line')
            linesep = mo.group('sep')
            terminator = len(payload)
        # We split the textual payload on the boundary separator, which
        # includes the trailing newline.  If the container is a
        # multipart/digest then the subparts are by default message/rfc822
        # instead of text/plain.  In that case, they'll have an optional
        # block of MIME headers, then an empty line followed by the
        # message headers.
        parts = re.split(
            linesep + re.escape(separator) + r'[ \t]*' + linesep,
            payload[start:terminator])
        # re.split() always yields at least one part, so these can be set
        # once up front instead of on every pass through the loop.
        container.preamble = preamble
        container.epilogue = epilogue
        for part in parts:
            if isdigest:
                # linesep may be the two-character '\r\n', so test for it as
                # a prefix and remove all of it rather than one character.
                if part.startswith(linesep):
                    # There's no header block so create an empty message
                    # object as the container, and lop off the newline so
                    # we can parse the sub-subobject
                    msgobj = self._class()
                    part = part[len(linesep):]
                else:
                    # NOTE: a part without an empty line makes this
                    # unpacking raise ValueError.
                    parthdrs, part = part.split(linesep+linesep, 1)
                    # msgobj in this case is the "message/rfc822" container
                    msgobj = self.parsestr(parthdrs, headersonly=True)
                # while submsgobj is the message itself
                submsgobj = self.parsestr(part)
                msgobj.attach(submsgobj)
                msgobj.set_default_type('message/rfc822')
            else:
                msgobj = self.parsestr(part)
            container.attach(msgobj)
Barry Warsawe5528822001-10-11 15:43:00 +0000245
246
247
class HeaderParser(Parser):
    """Parser variant that parses the headers and leaves the body unparsed.

    Use this class when only the headers of a message are of interest.  The
    body is still read from the input, but it is stored on the message as a
    plain string payload instead of being parsed.

    Skipping the body parse can make this considerably faster than Parser.
    """
    def _parsebody(self, container, fp):
        # Read whatever is left of the input and store it without parsing
        unparsed = fp.read()
        container.set_payload(unparsed)
260 container.set_payload(fp.read())