blob: 3fe1990ec5c4c3719a05306c540064595cfd7e6e [file] [log] [blame]
Barry Warsaw15e9dc92002-01-27 06:48:02 +00001# Copyright (C) 2001,2002 Python Software Foundation
Barry Warsawba925802001-09-23 03:17:28 +00002# Author: barry@zope.com (Barry Warsaw)
3
4"""A parser of RFC 2822 and MIME email messages.
5"""
6
Barry Warsaw7e21b672002-05-19 23:51:50 +00007import re
Barry Warsawba925802001-09-23 03:17:28 +00008from cStringIO import StringIO
Barry Warsaw15e9dc92002-01-27 06:48:02 +00009from types import ListType
Barry Warsawba925802001-09-23 03:17:28 +000010
Barry Warsaw69e18af2002-06-02 19:12:03 +000011from email import Errors
12from email import Message
Barry Warsawba925802001-09-23 03:17:28 +000013
# String constants used below as join separators for text fragments
# (EMPTYSTRING) and for the lines of a folded header value (NL).
EMPTYSTRING = ''
NL = '\n'

# Provide True/False on interpreters where the names are not defined: the
# bare expression raises NameError there, and the integer equivalents are
# bound in their place.
try:
    True, False
except NameError:
    True = 1
    False = 0

# Matches one line terminator in any of the three conventions (CRLF, CR, LF).
NLCRE = re.compile('\r\n|\r|\n')
Barry Warsaw487fe6a2002-10-07 17:27:35 +000024
class TextUtil:
    """Wrap a file-like object, adding line push-back and pattern scanning.

    Lines handed to unreadline() are kept on a stack.  They are returned
    by readline() -- most recently pushed first -- before any further data
    is taken from the wrapped file object, which only needs to provide
    readline() and read().
    """

    def __init__(self, fp):
        self.fp = fp
        # Push-back stack: the LAST element is the next line to hand out.
        self.unread = []

    def readline(self):
        """Return the next line of data.

        If data has been pushed back with unreadline(), the most recently
        pushed line is returned; otherwise the next line of the wrapped
        file is returned ('' at end-of-file).
        """
        if self.unread:
            return self.unread.pop()
        return self.fp.readline()

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        self.unread.append(line)

    def peekline(self):
        """Return the next line without consuming it."""
        line = self.readline()
        self.unreadline(line)
        return line

    def read(self):
        """Return all remaining data and clear the push-back stack.

        Pushed-back lines come first, in the same order readline() would
        have returned them, followed by the rest of the wrapped file.
        Nothing is inserted between the pieces: lines are expected to
        carry their own terminators.
        """
        rest = self.fp.read()
        if self.unread:
            # The stack is popped from the end, so read order is the
            # reverse of storage order.
            pending = self.unread[:]
            pending.reverse()
            self.unread = []
            rest = EMPTYSTRING.join(pending) + rest
        return rest

    def readuntil(self, re, afterblank=0, includematch=0):
        """Read a line at a time until one matches the given RE.

        re is a compiled pattern object (note that the parameter name
        shadows the re module inside this method); it is applied to each
        line with re.match().

        Returns a tuple (text, matchobj).  text is everything read before
        the matching line, plus the matching line itself if includematch
        is true.  If afterblank is true, a line only counts as a match
        when the line immediately before it was blank; blank lines are
        never tested against the pattern in that mode, but they are kept
        in the returned text.  The read position is left on the line
        following the matched line.  If end-of-file is reached first,
        the text read so far is returned with None as the match object.
        """
        prematch = []
        blankseen = 0
        while 1:
            line = self.readline()
            if not line:
                # end of file
                return EMPTYSTRING.join(prematch), None
            if afterblank and NLCRE.match(line):
                # The line starts with a line terminator, i.e. it has no
                # content.  Remember that for the next line's match test.
                blankseen = 1
                prematch.append(line)
                continue
            m = re.match(line)
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                return EMPTYSTRING.join(prematch), m
            # Only reset the flag AFTER the match test; resetting it
            # earlier would make a match impossible in afterblank mode.
            blankseen = 0
            prematch.append(line)
Barry Warsawe03e8f02002-09-28 20:44:58 +000094
Barry Warsawe968ead2001-10-04 17:05:11 +000095
class Parser:
    """Build a tree of message objects from RFC 2822 / MIME text."""

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        fp = TextUtil(fp)
        self._parseheaders(root, fp)
        if not headersonly:
            obj = self._parsemessage(root, fp)
            trailer = fp.read()
            # Compare against None explicitly: message objects define
            # __len__ as their header count, so a message without headers
            # is false in a boolean context and would lose its body.
            if obj is not None and trailer:
                self._attach_trailer(obj, trailer)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the file.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read one header block from fp and store it on container.

        Reading stops at end-of-file, at a blank line (which is consumed),
        or -- in non-strict mode -- at the first line that is neither a
        header nor a continuation line; that line is pushed back onto fp
        so it can be read again as body text.

        Returns a true value if such a line was pushed back, false
        otherwise.  Raises Errors.HeaderParseError for a continuation
        line before the first header and, in strict mode, for a misplaced
        Unix-from line or a line that is not a header.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        pushedback = False
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            rawline = fp.readline()
            if not rawline:
                break
            # Work on the line without its terminator, but keep the raw
            # text around in case it has to be pushed back.
            line = rawline.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.  Push back
                    # the raw line: without its terminator it would run into
                    # the following line and could not match a boundary.
                    fp.unreadline(rawline)
                    pushedback = True
                    break
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return pushedback

    def _parsemessage(self, container, fp):
        """Parse the body that belongs to container.

        We walk through the body from top to bottom, keeping track of the
        current multipart nesting as we go.  Returns the object that gets
        the data at the end of this block; the caller attaches whatever
        text follows to that object.  Raises Errors.BoundaryError for a
        multipart container that has no boundary parameter.
        """
        boundary = container.get_boundary()
        if boundary:
            return self._parsemultipart(container, fp, boundary)
        maintype = container.get_content_maintype()
        if maintype == "multipart":
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        if maintype == "message":
            return self._parsesubmessage(container, fp)
        # single body section.  We let our caller set the payload.
        return container

    def _parsemultipart(self, container, fp, boundary):
        """Split a body into subparts at boundary and attach them."""
        isdigest = (container.get_content_type() == 'multipart/digest')
        separator = '--' + boundary
        # The line terminator is optional so that a boundary sitting on an
        # unterminated last line of the input is still recognized.
        boundaryRE = re.compile(
            r'(?P<sep>' + re.escape(separator) +
            r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
        preamble, matchobj = fp.readuntil(boundaryRE)
        if not matchobj:
            # Broken - we hit the end of file.  Just set the body
            # to the text.
            container.set_payload(preamble)
            return container
        if preamble:
            container.preamble = preamble
        else:
            # The module docs specify an empty preamble is None, not ''
            container.preamble = None
        while 1:
            subobj = self._class()
            if isdigest:
                subobj.set_default_type('message/rfc822')
                firstline = fp.peekline()
                if firstline.strip():
                    # we have MIME headers.  all good.
                    self._parseheaders(subobj, fp)
                else:
                    # no MIME headers.  this is allowed for multipart/digest
                    # Consume the extra blank line
                    fp.readline()
            else:
                self._parseheaders(subobj, fp)
            container.attach(subobj)
            if subobj.get_content_maintype() in ("message", "multipart"):
                # From here on subobj is whichever nested object should
                # receive the text found before the next boundary.
                subobj = self._parsemessage(subobj, fp)

            trailer, matchobj = fp.readuntil(boundaryRE)
            if matchobj is None or trailer:
                # The line separator right before a boundary belongs to
                # the boundary, so strip exactly one from the text.  A
                # single search picks up CRLF as one separator.
                mo = re.search('(?P<sep>\r\n|\r|\n)$', trailer)
                if not mo:
                    raise Errors.BoundaryError(
                        'No terminating boundary and no trailing empty line')
                linesep = mo.group('sep')
                trailer = trailer[:-len(linesep)]
            if trailer:
                self._attach_trailer(subobj, trailer)
            if matchobj is None or matchobj.group('end'):
                # That was the last piece of data.  Let our caller attach
                # the epilogue to us.  But before we do that, push the
                # line ending of the match group back into the readline
                # buffer, as it's part of the epilogue.
                if matchobj:
                    linesep = matchobj.group('linesep')
                    if linesep:
                        fp.unreadline(linesep)
                return container

    def _parsesubmessage(self, container, fp):
        """Parse the body of a container whose main type is message."""
        ct = container.get_content_type()
        if ct == "message/rfc822":
            submessage = self._class()
            self._parseheaders(submessage, fp)
            # Hand back whatever the nested parse says should receive the
            # following text, so that a message nested inside another
            # message gets its body as payload.
            target = self._parsemessage(submessage, fp)
            if target is None:
                target = submessage
            container.attach(submessage)
            return target
        if ct == "message/delivery-status":
            # This special kind of type contains blocks of headers
            # separated by a blank line.  We'll represent each header
            # block as a separate Message object
            while 1:
                nextblock = self._class()
                stuck = self._parseheaders(nextblock, fp)
                container.attach(nextblock)
                if stuck:
                    # A non-header line was pushed back; parsing another
                    # block would stop on the same line again.  Leave it
                    # for the caller.
                    break
                # next peek ahead to see whether we've hit the end or not
                nextline = fp.peekline()
                if not nextline or nextline[:2] == "--":
                    # End of file, or the start of a boundary line.
                    break
            return container
        # Other sort of message object (e.g. external-body)
        msg = self._class()
        self._parsemessage(msg, fp)
        container.attach(msg)
        return msg

    def _attach_trailer(self, obj, trailer):
        """Store text that follows obj's headers or subparts on obj.

        Objects whose main type is message or multipart get the text as
        their epilogue; everything else gets it as the payload.
        """
        if obj.get_content_maintype() in ("message", "multipart"):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)
Barry Warsawe5528822001-10-11 15:43:00 +0000321
322
class HeaderParser(Parser):
    """Parser variant that only gives structure to the message headers.

    Use this class when the headers are all you care about.  The body is
    still read from the input, but it is not analyzed: it is stored on the
    message as one string payload.

    Skipping the body analysis can make parsing considerably faster.
    """
    def _parsemessage(self, container, fp):
        # Take everything that is left after the headers and store it
        # verbatim instead of parsing it.
        body = fp.read()
        container.set_payload(body)
        # Nothing is returned, so there is no object for the caller to
        # attach further text to.
        return None
Thomas Wouters0813d762004-03-20 17:31:29 +0000337 return None