blob: 869ef1618d2ea37042536b6a360fb68ec16a75db [file] [log] [blame]
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)

"""A parser of RFC 2822 and MIME email messages.
"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
EMPTYSTRING = ''
NL = '\n'



class Parser:
    def __init__(self, _class=Message.Message, strict=0):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=0):
        """Build a message object tree from the file-like object fp.

        fp must provide readline() and read().  The header block is always
        parsed; when headersonly is true the body is neither read nor parsed.
        Returns the root message object, an instance of _class.
        """
        root = self._class()
        self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp)
        return root

    def parsestr(self, text, headersonly=0):
        """Like parse(), but takes the message as the string text."""
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read one header block from fp and store it on container.

        Lines are consumed up to and including the first empty line, or to
        end of file.  Each header is stored with container[name] = value; a
        leading Unix From_ line is stored with container.set_unixfrom().
        Nothing is returned.

        Raises Errors.HeaderParseError for a continuation line seen before
        any header, for a line that is neither a header nor a continuation,
        and (strict mode only) for a From_ line after the first line.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        while 1:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Drop the trailing newline; an empty result is the blank line
            # that terminates the header block.
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if (not self._strict) and lineno == 1 and line.startswith('--'):
                    # Non-strict mode only: allow through duplicate boundary
                    # tags on the very first line.
                    continue
                raise Errors.HeaderParseError(
                    "Not a header, not a continuation: ``%s''" % line)
            # A new header starts, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)

    def _parsebody(self, container, fp):
        """Parse the rest of fp as the body of container.

        Dispatches on the container's boundary and content type:
        a container with a boundary is split into subparts; a
        message/delivery-status body becomes a list of header-block message
        objects; any other message/* body is parsed as a nested message and
        attached; everything else is stored verbatim as a string payload.

        Raises Errors.BoundaryError for a multipart container that defines
        no boundary.
        """
        boundary = container.get_boundary()
        if boundary:
            # Split the payload text into its constituent parts and parse
            # each separately.
            self._parsemultipart(container, fp, boundary)
        elif container.get_main_type() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object
            blocks = []
            while 1:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_main_type() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(fp.read())

    def _parsemultipart(self, container, fp, boundary):
        """Split the body read from fp on boundary and attach each subpart.

        Sets container.preamble and container.epilogue (None when absent) and
        attaches one parsed message object per subpart.  For multipart/digest
        containers each subpart is wrapped in a message object whose default
        type is message/rfc822.

        In non-strict mode a body without any starting boundary is stored
        verbatim as the payload.  Raises Errors.BoundaryError in strict mode
        when the starting or terminating boundary is missing, and in
        non-strict mode when there is neither a terminating boundary nor a
        trailing newline.
        """
        isdigest = (container.get_type() == 'multipart/digest')
        preamble = epilogue = None
        # Split into subparts.  The first boundary we're looking for won't
        # always have a leading newline since we're at the start of the
        # body text, and there's not always a preamble before the first
        # boundary.
        separator = '--' + boundary
        payload = fp.read()
        # We use an RE here because boundaries can have trailing
        # whitespace.
        mo = re.search(
            r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
            payload)
        if not mo:
            if self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            container.set_payload(payload)
            return
        start = mo.start()
        if start > 0:
            # there's some pre-MIME boundary preamble
            preamble = payload[0:start]
        # Step over the boundary line itself, including its line ending.
        start += len(mo.group('sep')) + len(mo.group('ws'))
        cre = re.compile('\r\n|\r|\n')
        mo = cre.search(payload, start)
        if mo:
            start += len(mo.group(0))
        # We create a compiled regexp first because we need to be able to
        # specify the start position, and the module function doesn't
        # support this signature. :(
        cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                         re.escape(separator) + '--')
        mo = cre.search(payload, start)
        if mo:
            terminator = mo.start()
            # The line ending in front of the closing boundary tells us what
            # kind of line endings the message uses.
            linesep = mo.group('sep')
            if mo.end() < len(payload):
                # There's some post-MIME boundary epilogue
                epilogue = payload[mo.end():]
        elif self._strict:
            raise Errors.BoundaryError(
                "Couldn't find terminating boundary: %s" % boundary)
        else:
            # Handle the case of no trailing boundary.  Check that it ends
            # in a blank line.  Some cases (spamspamspam) don't even have
            # that!
            mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
            if not mo:
                mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                if not mo:
                    raise Errors.BoundaryError(
                        'No terminating boundary and no trailing empty line')
            linesep = mo.group('sep')
            terminator = len(payload)
        # We split the textual payload on the boundary separator, which
        # includes the trailing newline.  If the container is a
        # multipart/digest then the subparts are by default message/rfc822
        # instead of text/plain.  In that case, they'll have an optional
        # block of MIME headers, then an empty line followed by the
        # message headers.
        parts = re.split(
            linesep + re.escape(separator) + r'[ \t]*' + linesep,
            payload[start:terminator])
        # These do not depend on the individual parts, so set them once.
        container.preamble = preamble
        container.epilogue = epilogue
        for part in parts:
            if isdigest:
                # linesep may be two characters long ('\r\n'), so compare
                # with startswith() and strip len(linesep) characters rather
                # than looking at part[0] only.
                if part.startswith(linesep):
                    # There's no header block so create an empty message
                    # object as the container, and lop off the newline so
                    # we can parse the sub-subobject
                    msgobj = self._class()
                    part = part[len(linesep):]
                else:
                    parthdrs, part = part.split(linesep+linesep, 1)
                    # msgobj in this case is the "message/rfc822" container
                    msgobj = self.parsestr(parthdrs, headersonly=1)
                # while submsgobj is the message itself
                submsgobj = self.parsestr(part)
                msgobj.attach(submsgobj)
                msgobj.set_default_type('message/rfc822')
            else:
                msgobj = self.parsestr(part)
            container.attach(msgobj)
Barry Warsawe5528822001-10-11 15:43:00 +0000238
239
240
class HeaderParser(Parser):
    """A Parser subclass that only meaningfully parses message headers.

    Use this class when the headers of a message are all you care about.
    The message body is still consumed, but it is not parsed; it is simply
    made available as a string payload.

    Because the body is never split or recursed into, parsing with this
    subclass can be considerably faster than with Parser.
    """
    def _parsebody(self, container, fp):
        # Read everything that is left in fp and store it on the container
        # untouched; no MIME structure is interpreted here.
        unparsed = fp.read()
        container.set_payload(unparsed)