blob: 228adbcebe55dcc9d2034ad3cfdca09626c63d80 [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
Barry Warsawba925802001-09-23 03:17:28 +000014EMPTYSTRING = ''
15NL = '\n'
16
Barry Warsawe968ead2001-10-04 17:05:11 +000017
Barry Warsawba925802001-09-23 03:17:28 +000018class Parser:
Barry Warsawf6caeba2002-07-09 02:50:02 +000019 def __init__(self, _class=Message.Message, strict=1):
Barry Warsawba925802001-09-23 03:17:28 +000020 """Parser of RFC 2822 and MIME email messages.
21
22 Creates an in-memory object tree representing the email message, which
23 can then be manipulated and turned over to a Generator to return the
24 textual representation of the message.
25
26 The string must be formatted as a block of RFC 2822 headers and header
27 continuation lines, optionally preceeded by a `Unix-from' header. The
28 header block is terminated either by the end of the string or by a
29 blank line.
30
31 _class is the class to instantiate for new message objects when they
32 must be created. This class must have a constructor that can take
33 zero arguments. Default is Message.Message.
Barry Warsawf6caeba2002-07-09 02:50:02 +000034
35 Optional strict tells the parser to be strictly RFC compliant or to be
36 more forgiving in parsing of ill-formatted MIME documents. When
37 non-strict mode is used, the parser will try to make up for missing or
38 erroneous boundaries and other peculiarities seen in the wild.
39 Defaults to strict parsing.
Barry Warsawba925802001-09-23 03:17:28 +000040 """
41 self._class = _class
Barry Warsawf6caeba2002-07-09 02:50:02 +000042 self._strict = strict
Barry Warsawba925802001-09-23 03:17:28 +000043
Barry Warsawf6caeba2002-07-09 02:50:02 +000044 def parse(self, fp, headersonly=0):
Barry Warsawba925802001-09-23 03:17:28 +000045 root = self._class()
46 self._parseheaders(root, fp)
Barry Warsawf6caeba2002-07-09 02:50:02 +000047 if not headersonly:
48 self._parsebody(root, fp)
Barry Warsawba925802001-09-23 03:17:28 +000049 return root
50
Barry Warsawf6caeba2002-07-09 02:50:02 +000051 def parsestr(self, text, headersonly=0):
52 return self.parse(StringIO(text), headersonly=headersonly)
Barry Warsawba925802001-09-23 03:17:28 +000053
54 def _parseheaders(self, container, fp):
55 # Parse the headers, returning a list of header/value pairs. None as
56 # the header means the Unix-From header.
57 lastheader = ''
58 lastvalue = []
59 lineno = 0
60 while 1:
Barry Warsaw409a4c02002-04-10 21:01:31 +000061 # Don't strip the line before we test for the end condition,
62 # because whitespace-only header lines are RFC compliant
63 # continuation lines.
64 line = fp.readline()
65 if not line:
Barry Warsawba925802001-09-23 03:17:28 +000066 break
Barry Warsaw409a4c02002-04-10 21:01:31 +000067 line = line.splitlines()[0]
68 if not line:
69 break
70 # Ignore the trailing newline
Barry Warsawba925802001-09-23 03:17:28 +000071 lineno += 1
72 # Check for initial Unix From_ line
73 if line.startswith('From '):
74 if lineno == 1:
75 container.set_unixfrom(line)
76 continue
Barry Warsawf6caeba2002-07-09 02:50:02 +000077 elif self._strict:
Barry Warsawba925802001-09-23 03:17:28 +000078 raise Errors.HeaderParseError(
79 'Unix-from in headers after first rfc822 header')
Barry Warsawf6caeba2002-07-09 02:50:02 +000080 else:
81 # ignore the wierdly placed From_ line
82 # XXX: maybe set unixfrom anyway? or only if not already?
83 continue
Barry Warsawba925802001-09-23 03:17:28 +000084 # Header continuation line
85 if line[0] in ' \t':
86 if not lastheader:
87 raise Errors.HeaderParseError(
88 'Continuation line seen before first header')
89 lastvalue.append(line)
90 continue
91 # Normal, non-continuation header. BAW: this should check to make
92 # sure it's a legal header, e.g. doesn't contain spaces. Also, we
93 # should expose the header matching algorithm in the API, and
94 # allow for a non-strict parsing mode (that ignores the line
95 # instead of raising the exception).
96 i = line.find(':')
97 if i < 0:
Barry Warsawf6caeba2002-07-09 02:50:02 +000098 if self._strict:
99 raise Errors.HeaderParseError(
100 "Not a header, not a continuation: ``%s''"%line)
101 elif lineno == 1 and line.startswith('--'):
102 # allow through duplicate boundary tags.
103 continue
104 else:
105 raise Errors.HeaderParseError(
106 "Not a header, not a continuation: ``%s''"%line)
Barry Warsawba925802001-09-23 03:17:28 +0000107 if lastheader:
108 container[lastheader] = NL.join(lastvalue)
109 lastheader = line[:i]
110 lastvalue = [line[i+1:].lstrip()]
111 # Make sure we retain the last header
112 if lastheader:
113 container[lastheader] = NL.join(lastvalue)
114
115 def _parsebody(self, container, fp):
116 # Parse the body, but first split the payload on the content-type
117 # boundary if present.
Barry Warsaw66971fb2001-09-26 05:44:09 +0000118 boundary = container.get_boundary()
119 isdigest = (container.get_type() == 'multipart/digest')
Barry Warsawba925802001-09-23 03:17:28 +0000120 # If there's a boundary, split the payload text into its constituent
121 # parts and parse each separately. Otherwise, just parse the rest of
122 # the body as a single message. Note: any exceptions raised in the
123 # recursive parse need to have their line numbers coerced.
124 if boundary:
125 preamble = epilogue = None
126 # Split into subparts. The first boundary we're looking for won't
127 # have the leading newline since we're at the start of the body
128 # text.
129 separator = '--' + boundary
130 payload = fp.read()
131 start = payload.find(separator)
132 if start < 0:
133 raise Errors.BoundaryError(
134 "Couldn't find starting boundary: %s" % boundary)
135 if start > 0:
136 # there's some pre-MIME boundary preamble
137 preamble = payload[0:start]
Barry Warsaw7e21b672002-05-19 23:51:50 +0000138 # Find out what kind of line endings we're using
139 start += len(separator)
140 cre = re.compile('\r\n|\r|\n')
141 mo = cre.search(payload, start)
142 if mo:
Barry Warsawf6caeba2002-07-09 02:50:02 +0000143 start += len(mo.group(0))
Barry Warsaw7e21b672002-05-19 23:51:50 +0000144 # We create a compiled regexp first because we need to be able to
145 # specify the start position, and the module function doesn't
146 # support this signature. :(
147 cre = re.compile('(?P<sep>\r\n|\r|\n)' +
148 re.escape(separator) + '--')
149 mo = cre.search(payload, start)
Barry Warsawf6caeba2002-07-09 02:50:02 +0000150 if mo:
151 terminator = mo.start()
152 linesep = mo.group('sep')
153 if mo.end() < len(payload):
154 # there's some post-MIME boundary epilogue
155 epilogue = payload[mo.end():]
156 elif self._strict:
Barry Warsawba925802001-09-23 03:17:28 +0000157 raise Errors.BoundaryError(
Barry Warsawf6caeba2002-07-09 02:50:02 +0000158 "Couldn't find terminating boundary: %s" % boundary)
159 else:
160 # handle the case of no trailing boundary. I hate mail clients.
161 # check that it ends in a blank line
162 endre = re.compile('(?P<sep>\r\n|\r|\n){2}$')
163 mo = endre.search(payload)
164 if not mo:
165 raise Errors.BoundaryError(
166 "Couldn't find terminating boundary, and no "+
167 "trailing empty line")
168 else:
169 linesep = mo.group('sep')
170 terminator = len(payload)
Barry Warsawba925802001-09-23 03:17:28 +0000171 # We split the textual payload on the boundary separator, which
Barry Warsawf6caeba2002-07-09 02:50:02 +0000172 # includes the trailing newline. If the container is a
173 # multipart/digest then the subparts are by default message/rfc822
174 # instead of text/plain. In that case, they'll have a optional
175 # block of MIME headers, then an empty line followed by the
176 # message headers.
177 separator += linesep
Barry Warsaw7e21b672002-05-19 23:51:50 +0000178 parts = payload[start:terminator].split(linesep + separator)
Barry Warsawba925802001-09-23 03:17:28 +0000179 for part in parts:
Barry Warsawf6caeba2002-07-09 02:50:02 +0000180 if isdigest:
181 if part[0] == linesep:
182 # There's no header block so create an empty message
183 # object as the container, and lop off the newline so
184 # we can parse the sub-subobject
185 msgobj = self._class()
186 part = part[1:]
187 else:
188 parthdrs, part = part.split(linesep+linesep, 1)
189 # msgobj in this case is the "message/rfc822" container
190 msgobj = self.parsestr(parthdrs, headersonly=1)
191 # while submsgobj is the message itself
192 submsgobj = self.parsestr(part)
193 msgobj.attach(submsgobj)
194 msgobj.set_default_type('message/rfc822')
195 else:
196 msgobj = self.parsestr(part)
Barry Warsawba925802001-09-23 03:17:28 +0000197 container.preamble = preamble
198 container.epilogue = epilogue
Barry Warsaw409a4c02002-04-10 21:01:31 +0000199 container.attach(msgobj)
200 elif container.get_main_type() == 'multipart':
201 # Very bad. A message is a multipart with no boundary!
202 raise Errors.BoundaryError(
203 'multipart message with no defined boundary')
Barry Warsaw66971fb2001-09-26 05:44:09 +0000204 elif container.get_type() == 'message/delivery-status':
205 # This special kind of type contains blocks of headers separated
206 # by a blank line. We'll represent each header block as a
207 # separate Message object
208 blocks = []
209 while 1:
210 blockmsg = self._class()
211 self._parseheaders(blockmsg, fp)
212 if not len(blockmsg):
213 # No more header blocks left
214 break
215 blocks.append(blockmsg)
216 container.set_payload(blocks)
217 elif container.get_main_type() == 'message':
Barry Warsawba925802001-09-23 03:17:28 +0000218 # Create a container for the payload, but watch out for there not
219 # being any headers left
220 try:
221 msg = self.parse(fp)
222 except Errors.HeaderParseError:
223 msg = self._class()
224 self._parsebody(msg, fp)
Barry Warsaw69e18af2002-06-02 19:12:03 +0000225 container.attach(msg)
Barry Warsawba925802001-09-23 03:17:28 +0000226 else:
Barry Warsaw409a4c02002-04-10 21:01:31 +0000227 container.set_payload(fp.read())
Barry Warsawe5528822001-10-11 15:43:00 +0000228
229
230
class HeaderParser(Parser):
    """Parser variant that parses the headers and leaves the body unparsed.

    Use this class when only the headers of a message matter.  The body is
    still read from the input, but no attempt is made to interpret it; the
    raw text is stored on the message as a string payload.

    Skipping the body parse can make this considerably faster than the full
    Parser when all you need is the message headers.
    """
    def _parsebody(self, container, fp):
        # Read everything that follows the header block and store it
        # verbatim as the payload, without parsing it.
        remainder = fp.read()
        container.set_payload(remainder)
243 container.set_payload(fp.read())