blob: 308110796c95e4b03aa7d2eac2ba0fbdb8cb1e9c [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# Empty-string constant.  NOTE(review): not referenced anywhere in the
# visible part of this module -- presumably kept for importers; confirm
# before removing.
EMPTYSTRING = ''
# Newline constant; the header parser joins a header's first line and its
# continuation lines with it (NL.join(...)).
NL = '\n'
16
Barry Warsawe968ead2001-10-04 17:05:11 +000017
Barry Warsawba925802001-09-23 03:17:28 +000018class Parser:
Barry Warsawf6caeba2002-07-09 02:50:02 +000019 def __init__(self, _class=Message.Message, strict=1):
Barry Warsawba925802001-09-23 03:17:28 +000020 """Parser of RFC 2822 and MIME email messages.
21
22 Creates an in-memory object tree representing the email message, which
23 can then be manipulated and turned over to a Generator to return the
24 textual representation of the message.
25
26 The string must be formatted as a block of RFC 2822 headers and header
27 continuation lines, optionally preceeded by a `Unix-from' header. The
28 header block is terminated either by the end of the string or by a
29 blank line.
30
31 _class is the class to instantiate for new message objects when they
32 must be created. This class must have a constructor that can take
33 zero arguments. Default is Message.Message.
Barry Warsawf6caeba2002-07-09 02:50:02 +000034
35 Optional strict tells the parser to be strictly RFC compliant or to be
36 more forgiving in parsing of ill-formatted MIME documents. When
37 non-strict mode is used, the parser will try to make up for missing or
38 erroneous boundaries and other peculiarities seen in the wild.
39 Defaults to strict parsing.
Barry Warsawba925802001-09-23 03:17:28 +000040 """
41 self._class = _class
Barry Warsawf6caeba2002-07-09 02:50:02 +000042 self._strict = strict
Barry Warsawba925802001-09-23 03:17:28 +000043
Barry Warsawf6caeba2002-07-09 02:50:02 +000044 def parse(self, fp, headersonly=0):
Barry Warsawba925802001-09-23 03:17:28 +000045 root = self._class()
46 self._parseheaders(root, fp)
Barry Warsawf6caeba2002-07-09 02:50:02 +000047 if not headersonly:
48 self._parsebody(root, fp)
Barry Warsawba925802001-09-23 03:17:28 +000049 return root
50
Barry Warsawf6caeba2002-07-09 02:50:02 +000051 def parsestr(self, text, headersonly=0):
52 return self.parse(StringIO(text), headersonly=headersonly)
Barry Warsawba925802001-09-23 03:17:28 +000053
54 def _parseheaders(self, container, fp):
55 # Parse the headers, returning a list of header/value pairs. None as
56 # the header means the Unix-From header.
57 lastheader = ''
58 lastvalue = []
59 lineno = 0
60 while 1:
Barry Warsaw409a4c02002-04-10 21:01:31 +000061 # Don't strip the line before we test for the end condition,
62 # because whitespace-only header lines are RFC compliant
63 # continuation lines.
64 line = fp.readline()
65 if not line:
Barry Warsawba925802001-09-23 03:17:28 +000066 break
Barry Warsaw409a4c02002-04-10 21:01:31 +000067 line = line.splitlines()[0]
68 if not line:
69 break
70 # Ignore the trailing newline
Barry Warsawba925802001-09-23 03:17:28 +000071 lineno += 1
72 # Check for initial Unix From_ line
73 if line.startswith('From '):
74 if lineno == 1:
75 container.set_unixfrom(line)
76 continue
Barry Warsawf6caeba2002-07-09 02:50:02 +000077 elif self._strict:
Barry Warsawba925802001-09-23 03:17:28 +000078 raise Errors.HeaderParseError(
79 'Unix-from in headers after first rfc822 header')
Barry Warsawf6caeba2002-07-09 02:50:02 +000080 else:
81 # ignore the wierdly placed From_ line
82 # XXX: maybe set unixfrom anyway? or only if not already?
83 continue
Barry Warsawba925802001-09-23 03:17:28 +000084 # Header continuation line
85 if line[0] in ' \t':
86 if not lastheader:
87 raise Errors.HeaderParseError(
88 'Continuation line seen before first header')
89 lastvalue.append(line)
90 continue
91 # Normal, non-continuation header. BAW: this should check to make
92 # sure it's a legal header, e.g. doesn't contain spaces. Also, we
93 # should expose the header matching algorithm in the API, and
94 # allow for a non-strict parsing mode (that ignores the line
95 # instead of raising the exception).
96 i = line.find(':')
97 if i < 0:
Barry Warsawf6caeba2002-07-09 02:50:02 +000098 if self._strict:
99 raise Errors.HeaderParseError(
100 "Not a header, not a continuation: ``%s''"%line)
101 elif lineno == 1 and line.startswith('--'):
102 # allow through duplicate boundary tags.
103 continue
104 else:
105 raise Errors.HeaderParseError(
106 "Not a header, not a continuation: ``%s''"%line)
Barry Warsawba925802001-09-23 03:17:28 +0000107 if lastheader:
108 container[lastheader] = NL.join(lastvalue)
109 lastheader = line[:i]
110 lastvalue = [line[i+1:].lstrip()]
111 # Make sure we retain the last header
112 if lastheader:
113 container[lastheader] = NL.join(lastvalue)
114
115 def _parsebody(self, container, fp):
116 # Parse the body, but first split the payload on the content-type
117 # boundary if present.
Barry Warsaw66971fb2001-09-26 05:44:09 +0000118 boundary = container.get_boundary()
119 isdigest = (container.get_type() == 'multipart/digest')
Barry Warsawba925802001-09-23 03:17:28 +0000120 # If there's a boundary, split the payload text into its constituent
121 # parts and parse each separately. Otherwise, just parse the rest of
122 # the body as a single message. Note: any exceptions raised in the
123 # recursive parse need to have their line numbers coerced.
124 if boundary:
125 preamble = epilogue = None
126 # Split into subparts. The first boundary we're looking for won't
Barry Warsaw7aeac912002-07-18 23:09:09 +0000127 # always have a leading newline since we're at the start of the
128 # body text, and there's not always a preamble before the first
129 # boundary.
Barry Warsawba925802001-09-23 03:17:28 +0000130 separator = '--' + boundary
131 payload = fp.read()
Barry Warsaw7aeac912002-07-18 23:09:09 +0000132 # We use an RE here because boundaries can have trailing
133 # whitespace.
134 mo = re.search(
135 r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
136 payload)
137 if not mo:
Barry Warsawba925802001-09-23 03:17:28 +0000138 raise Errors.BoundaryError(
139 "Couldn't find starting boundary: %s" % boundary)
Barry Warsaw7aeac912002-07-18 23:09:09 +0000140 start = mo.start()
Barry Warsawba925802001-09-23 03:17:28 +0000141 if start > 0:
142 # there's some pre-MIME boundary preamble
143 preamble = payload[0:start]
Barry Warsaw7e21b672002-05-19 23:51:50 +0000144 # Find out what kind of line endings we're using
Barry Warsaw7aeac912002-07-18 23:09:09 +0000145 start += len(mo.group('sep')) + len(mo.group('ws'))
Barry Warsaw7e21b672002-05-19 23:51:50 +0000146 cre = re.compile('\r\n|\r|\n')
147 mo = cre.search(payload, start)
148 if mo:
Barry Warsawf6caeba2002-07-09 02:50:02 +0000149 start += len(mo.group(0))
Barry Warsaw7e21b672002-05-19 23:51:50 +0000150 # We create a compiled regexp first because we need to be able to
151 # specify the start position, and the module function doesn't
152 # support this signature. :(
153 cre = re.compile('(?P<sep>\r\n|\r|\n)' +
154 re.escape(separator) + '--')
155 mo = cre.search(payload, start)
Barry Warsawf6caeba2002-07-09 02:50:02 +0000156 if mo:
157 terminator = mo.start()
158 linesep = mo.group('sep')
159 if mo.end() < len(payload):
Barry Warsaw7aeac912002-07-18 23:09:09 +0000160 # There's some post-MIME boundary epilogue
Barry Warsawf6caeba2002-07-09 02:50:02 +0000161 epilogue = payload[mo.end():]
162 elif self._strict:
Barry Warsawba925802001-09-23 03:17:28 +0000163 raise Errors.BoundaryError(
Barry Warsawf6caeba2002-07-09 02:50:02 +0000164 "Couldn't find terminating boundary: %s" % boundary)
165 else:
Barry Warsaw7aeac912002-07-18 23:09:09 +0000166 # Handle the case of no trailing boundary. Check that it ends
167 # in a blank line. Some cases (spamspamspam) don't even have
168 # that!
169 mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
Barry Warsawf6caeba2002-07-09 02:50:02 +0000170 if not mo:
Barry Warsaw7aeac912002-07-18 23:09:09 +0000171 mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
172 if not mo:
173 raise Errors.BoundaryError(
174 'No terminating boundary and no trailing empty line')
175 linesep = mo.group('sep')
176 terminator = len(payload)
Barry Warsawba925802001-09-23 03:17:28 +0000177 # We split the textual payload on the boundary separator, which
Barry Warsawf6caeba2002-07-09 02:50:02 +0000178 # includes the trailing newline. If the container is a
179 # multipart/digest then the subparts are by default message/rfc822
180 # instead of text/plain. In that case, they'll have a optional
181 # block of MIME headers, then an empty line followed by the
182 # message headers.
Barry Warsaw7aeac912002-07-18 23:09:09 +0000183 parts = re.split(
184 linesep + re.escape(separator) + r'[ \t]*' + linesep,
185 payload[start:terminator])
Barry Warsawba925802001-09-23 03:17:28 +0000186 for part in parts:
Barry Warsawf6caeba2002-07-09 02:50:02 +0000187 if isdigest:
188 if part[0] == linesep:
189 # There's no header block so create an empty message
190 # object as the container, and lop off the newline so
191 # we can parse the sub-subobject
192 msgobj = self._class()
193 part = part[1:]
194 else:
195 parthdrs, part = part.split(linesep+linesep, 1)
196 # msgobj in this case is the "message/rfc822" container
197 msgobj = self.parsestr(parthdrs, headersonly=1)
198 # while submsgobj is the message itself
199 submsgobj = self.parsestr(part)
200 msgobj.attach(submsgobj)
201 msgobj.set_default_type('message/rfc822')
202 else:
203 msgobj = self.parsestr(part)
Barry Warsawba925802001-09-23 03:17:28 +0000204 container.preamble = preamble
205 container.epilogue = epilogue
Barry Warsaw409a4c02002-04-10 21:01:31 +0000206 container.attach(msgobj)
207 elif container.get_main_type() == 'multipart':
208 # Very bad. A message is a multipart with no boundary!
209 raise Errors.BoundaryError(
210 'multipart message with no defined boundary')
Barry Warsaw66971fb2001-09-26 05:44:09 +0000211 elif container.get_type() == 'message/delivery-status':
212 # This special kind of type contains blocks of headers separated
213 # by a blank line. We'll represent each header block as a
214 # separate Message object
215 blocks = []
216 while 1:
217 blockmsg = self._class()
218 self._parseheaders(blockmsg, fp)
219 if not len(blockmsg):
220 # No more header blocks left
221 break
222 blocks.append(blockmsg)
223 container.set_payload(blocks)
224 elif container.get_main_type() == 'message':
Barry Warsawba925802001-09-23 03:17:28 +0000225 # Create a container for the payload, but watch out for there not
226 # being any headers left
227 try:
228 msg = self.parse(fp)
229 except Errors.HeaderParseError:
230 msg = self._class()
231 self._parsebody(msg, fp)
Barry Warsaw69e18af2002-06-02 19:12:03 +0000232 container.attach(msg)
Barry Warsawba925802001-09-23 03:17:28 +0000233 else:
Barry Warsaw409a4c02002-04-10 21:01:31 +0000234 container.set_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000235
236
237
class HeaderParser(Parser):
    """Parser variant for callers that only care about message headers.

    Headers are parsed exactly as in Parser.  The body is still read from the
    input, but it is never examined: whatever follows the header block is
    stored on the message as a single string payload.

    Skipping the body parse can make this noticeably faster than Parser when
    the headers are all you need.
    """

    def _parsebody(self, container, fp):
        # Slurp the remainder of the input and hand it over untouched
        body = fp.read()
        container.set_payload(body)
250 container.set_payload(fp.read())