blob: 119a90dbf4914176f2ddb8ff84238c1c197d7fa6 [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# Commonly used string constants.
EMPTYSTRING = ''
NL = '\n'

# Compatibility shim: evaluating the names True and False raises NameError
# on interpreters that do not provide them as built-ins, in which case
# integer stand-ins are defined at module level.
try:
    True, False
except NameError:
    True = 1
    False = 0
22
23
Barry Warsawe968ead2001-10-04 17:05:11 +000024
class Parser:
    """Build a tree of message objects from RFC 2822 / MIME message text.

    Use parse() for a file-like object or parsestr() for a string.  Headers
    are stored on the message object through item assignment; the body is
    stored either as a string payload or as attached sub-message objects,
    depending on the content type of the message.
    """

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the string.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read the header block from fp and store each header on container.

        Reading stops at end of file or at the first empty line, which is
        consumed.  A `From ' line seen as the very first line is recorded
        with container.set_unixfrom().  Lines beginning with a space or tab
        are continuations of the previous header; they are kept verbatim and
        joined to the header's first line with newlines.

        Raises Errors.HeaderParseError for a continuation line seen before
        any header, for a line that is neither a header nor a continuation
        (non-strict mode tolerates this only when it is the first line and
        starts with `--'), and, in strict mode only, for a `From ' line that
        is not the first line.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Drop the line terminator; an empty result is the blank line
            # that ends the header block.
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                # Otherwise ignore the weirdly placed From_ line.
                # XXX: maybe set unixfrom anyway? or only if not already?
                continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API.
            i = line.find(':')
            if i < 0:
                if (not self._strict and lineno == 1
                        and line.startswith('--')):
                    # allow through duplicate boundary tags.
                    continue
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                container[lastheader] = '\n'.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = '\n'.join(lastvalue)

    def _parsebody(self, container, fp):
        """Parse the rest of fp as the body of container.

        If container reports a boundary, the remaining text is split on that
        boundary and every part is parsed recursively and attached; text
        before the first boundary and after the closing boundary is stored
        in container.preamble and container.epilogue (None when absent).
        For a multipart/digest container each part is wrapped in an extra
        message object whose default type is message/rfc822.
        message/delivery-status bodies become a list of header-only message
        objects, other message/* bodies are parsed as one nested message,
        and everything else is stored as a string payload.

        Raises Errors.BoundaryError when a multipart container has no
        boundary, in strict mode when the starting or terminating boundary
        is missing, and in non-strict mode when there is neither a
        terminating boundary nor a trailing line ending.
        """
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = fp.read()
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                if self._strict:
                    raise Errors.BoundaryError(
                        "Couldn't find starting boundary: %s" % boundary)
                # Non-strict: keep the unsplit text as the payload.
                container.set_payload(payload)
                return
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Skip past the boundary, its trailing whitespace and the line
            # ending that follows it.
            start += len(mo.group('sep')) + len(mo.group('ws'))
            cre = re.compile('\r\n|\r|\n')
            mo = cre.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        raise Errors.BoundaryError(
                            'No terminating boundary and no trailing '
                            'empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline.  If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have an optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            # These do not depend on the individual parts, so set them once.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                if isdigest:
                    # linesep may be two characters long ('\r\n'), so compare
                    # and strip by its real length rather than by one
                    # character.
                    if part.startswith(linesep):
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[len(linesep):]
                    else:
                        parthdrs, part = part.split(linesep+linesep, 1)
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=True)
                    # while submsgobj is the message itself
                    submsgobj = self.parsestr(part)
                    msgobj.attach(submsgobj)
                    msgobj.set_default_type('message/rfc822')
                else:
                    msgobj = self.parsestr(part)
                container.attach(msgobj)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000259
260
261
class HeaderParser(Parser):
    """Parser variant that parses message headers but not the body.

    Choose this class when only the headers of a message matter.  The body
    is still read from the input, but it is not parsed; it is stored on the
    message unchanged as a single string payload.

    Because the body is never analyzed, parsing with this class can be
    considerably faster than with the full Parser.
    """
    def _parsebody(self, container, fp):
        # Read everything that follows the headers and keep it verbatim.
        remainder = fp.read()
        container.set_payload(remainder)