blob: b9d3ed3645e9a26cb06005b731236779a6c45cd4 [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# Shared string constants: the empty string, and the newline used to join the
# physical lines of a folded (continued) header value back together.
EMPTYSTRING = ""
NL = "\n"
16
Barry Warsawe968ead2001-10-04 17:05:11 +000017
Barry Warsawba925802001-09-23 03:17:28 +000018class Parser:
Barry Warsawbb26b452002-07-19 22:25:34 +000019 def __init__(self, _class=Message.Message, strict=0):
Barry Warsawba925802001-09-23 03:17:28 +000020 """Parser of RFC 2822 and MIME email messages.
21
22 Creates an in-memory object tree representing the email message, which
23 can then be manipulated and turned over to a Generator to return the
24 textual representation of the message.
25
26 The string must be formatted as a block of RFC 2822 headers and header
27 continuation lines, optionally preceeded by a `Unix-from' header. The
28 header block is terminated either by the end of the string or by a
29 blank line.
30
31 _class is the class to instantiate for new message objects when they
32 must be created. This class must have a constructor that can take
33 zero arguments. Default is Message.Message.
Barry Warsawf6caeba2002-07-09 02:50:02 +000034
35 Optional strict tells the parser to be strictly RFC compliant or to be
36 more forgiving in parsing of ill-formatted MIME documents. When
37 non-strict mode is used, the parser will try to make up for missing or
38 erroneous boundaries and other peculiarities seen in the wild.
Barry Warsawbb26b452002-07-19 22:25:34 +000039 Default is non-strict parsing.
Barry Warsawba925802001-09-23 03:17:28 +000040 """
41 self._class = _class
Barry Warsawf6caeba2002-07-09 02:50:02 +000042 self._strict = strict
Barry Warsawba925802001-09-23 03:17:28 +000043
Barry Warsawf6caeba2002-07-09 02:50:02 +000044 def parse(self, fp, headersonly=0):
Barry Warsawba925802001-09-23 03:17:28 +000045 root = self._class()
46 self._parseheaders(root, fp)
Barry Warsawf6caeba2002-07-09 02:50:02 +000047 if not headersonly:
48 self._parsebody(root, fp)
Barry Warsawba925802001-09-23 03:17:28 +000049 return root
50
Barry Warsawf6caeba2002-07-09 02:50:02 +000051 def parsestr(self, text, headersonly=0):
52 return self.parse(StringIO(text), headersonly=headersonly)
Barry Warsawba925802001-09-23 03:17:28 +000053
54 def _parseheaders(self, container, fp):
55 # Parse the headers, returning a list of header/value pairs. None as
56 # the header means the Unix-From header.
57 lastheader = ''
58 lastvalue = []
59 lineno = 0
60 while 1:
Barry Warsaw409a4c02002-04-10 21:01:31 +000061 # Don't strip the line before we test for the end condition,
62 # because whitespace-only header lines are RFC compliant
63 # continuation lines.
64 line = fp.readline()
65 if not line:
Barry Warsawba925802001-09-23 03:17:28 +000066 break
Barry Warsaw409a4c02002-04-10 21:01:31 +000067 line = line.splitlines()[0]
68 if not line:
69 break
70 # Ignore the trailing newline
Barry Warsawba925802001-09-23 03:17:28 +000071 lineno += 1
72 # Check for initial Unix From_ line
73 if line.startswith('From '):
74 if lineno == 1:
75 container.set_unixfrom(line)
76 continue
Barry Warsawf6caeba2002-07-09 02:50:02 +000077 elif self._strict:
Barry Warsawba925802001-09-23 03:17:28 +000078 raise Errors.HeaderParseError(
79 'Unix-from in headers after first rfc822 header')
Barry Warsawf6caeba2002-07-09 02:50:02 +000080 else:
81 # ignore the wierdly placed From_ line
82 # XXX: maybe set unixfrom anyway? or only if not already?
83 continue
Barry Warsawba925802001-09-23 03:17:28 +000084 # Header continuation line
85 if line[0] in ' \t':
86 if not lastheader:
87 raise Errors.HeaderParseError(
88 'Continuation line seen before first header')
89 lastvalue.append(line)
90 continue
91 # Normal, non-continuation header. BAW: this should check to make
92 # sure it's a legal header, e.g. doesn't contain spaces. Also, we
93 # should expose the header matching algorithm in the API, and
94 # allow for a non-strict parsing mode (that ignores the line
95 # instead of raising the exception).
96 i = line.find(':')
97 if i < 0:
Barry Warsawf6caeba2002-07-09 02:50:02 +000098 if self._strict:
99 raise Errors.HeaderParseError(
100 "Not a header, not a continuation: ``%s''"%line)
101 elif lineno == 1 and line.startswith('--'):
102 # allow through duplicate boundary tags.
103 continue
104 else:
105 raise Errors.HeaderParseError(
106 "Not a header, not a continuation: ``%s''"%line)
Barry Warsawba925802001-09-23 03:17:28 +0000107 if lastheader:
108 container[lastheader] = NL.join(lastvalue)
109 lastheader = line[:i]
110 lastvalue = [line[i+1:].lstrip()]
111 # Make sure we retain the last header
112 if lastheader:
113 container[lastheader] = NL.join(lastvalue)
114
115 def _parsebody(self, container, fp):
116 # Parse the body, but first split the payload on the content-type
117 # boundary if present.
Barry Warsaw66971fb2001-09-26 05:44:09 +0000118 boundary = container.get_boundary()
119 isdigest = (container.get_type() == 'multipart/digest')
Barry Warsawba925802001-09-23 03:17:28 +0000120 # If there's a boundary, split the payload text into its constituent
121 # parts and parse each separately. Otherwise, just parse the rest of
122 # the body as a single message. Note: any exceptions raised in the
123 # recursive parse need to have their line numbers coerced.
124 if boundary:
125 preamble = epilogue = None
126 # Split into subparts. The first boundary we're looking for won't
Barry Warsaw7aeac912002-07-18 23:09:09 +0000127 # always have a leading newline since we're at the start of the
128 # body text, and there's not always a preamble before the first
129 # boundary.
Barry Warsawba925802001-09-23 03:17:28 +0000130 separator = '--' + boundary
131 payload = fp.read()
Barry Warsaw7aeac912002-07-18 23:09:09 +0000132 # We use an RE here because boundaries can have trailing
133 # whitespace.
134 mo = re.search(
135 r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
136 payload)
137 if not mo:
Barry Warsawba925802001-09-23 03:17:28 +0000138 raise Errors.BoundaryError(
139 "Couldn't find starting boundary: %s" % boundary)
Barry Warsaw7aeac912002-07-18 23:09:09 +0000140 start = mo.start()
Barry Warsawba925802001-09-23 03:17:28 +0000141 if start > 0:
142 # there's some pre-MIME boundary preamble
143 preamble = payload[0:start]
Barry Warsaw7e21b672002-05-19 23:51:50 +0000144 # Find out what kind of line endings we're using
Barry Warsaw7aeac912002-07-18 23:09:09 +0000145 start += len(mo.group('sep')) + len(mo.group('ws'))
Barry Warsaw7e21b672002-05-19 23:51:50 +0000146 cre = re.compile('\r\n|\r|\n')
147 mo = cre.search(payload, start)
148 if mo:
Barry Warsawf6caeba2002-07-09 02:50:02 +0000149 start += len(mo.group(0))
Barry Warsaw7e21b672002-05-19 23:51:50 +0000150 # We create a compiled regexp first because we need to be able to
151 # specify the start position, and the module function doesn't
152 # support this signature. :(
153 cre = re.compile('(?P<sep>\r\n|\r|\n)' +
154 re.escape(separator) + '--')
155 mo = cre.search(payload, start)
Barry Warsawf6caeba2002-07-09 02:50:02 +0000156 if mo:
157 terminator = mo.start()
158 linesep = mo.group('sep')
159 if mo.end() < len(payload):
Barry Warsaw7aeac912002-07-18 23:09:09 +0000160 # There's some post-MIME boundary epilogue
Barry Warsawf6caeba2002-07-09 02:50:02 +0000161 epilogue = payload[mo.end():]
162 elif self._strict:
Barry Warsawba925802001-09-23 03:17:28 +0000163 raise Errors.BoundaryError(
Barry Warsawf6caeba2002-07-09 02:50:02 +0000164 "Couldn't find terminating boundary: %s" % boundary)
165 else:
Barry Warsaw7aeac912002-07-18 23:09:09 +0000166 # Handle the case of no trailing boundary. Check that it ends
167 # in a blank line. Some cases (spamspamspam) don't even have
168 # that!
169 mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
Barry Warsawf6caeba2002-07-09 02:50:02 +0000170 if not mo:
Barry Warsaw7aeac912002-07-18 23:09:09 +0000171 mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
172 if not mo:
173 raise Errors.BoundaryError(
174 'No terminating boundary and no trailing empty line')
175 linesep = mo.group('sep')
176 terminator = len(payload)
Barry Warsawba925802001-09-23 03:17:28 +0000177 # We split the textual payload on the boundary separator, which
Barry Warsawf6caeba2002-07-09 02:50:02 +0000178 # includes the trailing newline. If the container is a
179 # multipart/digest then the subparts are by default message/rfc822
180 # instead of text/plain. In that case, they'll have a optional
181 # block of MIME headers, then an empty line followed by the
182 # message headers.
Barry Warsaw7aeac912002-07-18 23:09:09 +0000183 parts = re.split(
184 linesep + re.escape(separator) + r'[ \t]*' + linesep,
185 payload[start:terminator])
Barry Warsawba925802001-09-23 03:17:28 +0000186 for part in parts:
Barry Warsawf6caeba2002-07-09 02:50:02 +0000187 if isdigest:
188 if part[0] == linesep:
189 # There's no header block so create an empty message
190 # object as the container, and lop off the newline so
191 # we can parse the sub-subobject
192 msgobj = self._class()
193 part = part[1:]
194 else:
195 parthdrs, part = part.split(linesep+linesep, 1)
196 # msgobj in this case is the "message/rfc822" container
197 msgobj = self.parsestr(parthdrs, headersonly=1)
198 # while submsgobj is the message itself
199 submsgobj = self.parsestr(part)
200 msgobj.attach(submsgobj)
201 msgobj.set_default_type('message/rfc822')
202 else:
203 msgobj = self.parsestr(part)
Barry Warsawba925802001-09-23 03:17:28 +0000204 container.preamble = preamble
205 container.epilogue = epilogue
Barry Warsaw409a4c02002-04-10 21:01:31 +0000206 container.attach(msgobj)
207 elif container.get_main_type() == 'multipart':
208 # Very bad. A message is a multipart with no boundary!
209 raise Errors.BoundaryError(
210 'multipart message with no defined boundary')
Barry Warsaw66971fb2001-09-26 05:44:09 +0000211 elif container.get_type() == 'message/delivery-status':
212 # This special kind of type contains blocks of headers separated
213 # by a blank line. We'll represent each header block as a
214 # separate Message object
215 blocks = []
216 while 1:
217 blockmsg = self._class()
218 self._parseheaders(blockmsg, fp)
219 if not len(blockmsg):
220 # No more header blocks left
221 break
222 blocks.append(blockmsg)
223 container.set_payload(blocks)
224 elif container.get_main_type() == 'message':
Barry Warsawba925802001-09-23 03:17:28 +0000225 # Create a container for the payload, but watch out for there not
226 # being any headers left
227 try:
228 msg = self.parse(fp)
229 except Errors.HeaderParseError:
230 msg = self._class()
231 self._parsebody(msg, fp)
Barry Warsaw69e18af2002-06-02 19:12:03 +0000232 container.attach(msg)
Barry Warsawba925802001-09-23 03:17:28 +0000233 else:
Barry Warsaw409a4c02002-04-10 21:01:31 +0000234 container.set_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000235
236
237
class HeaderParser(Parser):
    """Parser variant that parses the headers and leaves the body unparsed.

    Use this class when only the headers of a message are of interest.  The
    body is still read from the input, but it is not analysed: it is stored
    on the message as one string payload.

    Skipping the body parse can make this considerably faster than the full
    Parser when the message headers are all you need.
    """

    def _parsebody(self, container, fp):
        # Read whatever is left of the input and store it untouched as the
        # payload; no MIME structure is interpreted here.
        remainder = fp.read()
        container.set_payload(remainder)
250 container.set_payload(fp.read())