blob: 690b7c2866df84d160432e17fb50b2f75d33d3a7 [file] [log] [blame]
Barry Warsaw418101f2004-05-09 03:29:23 +00001# Copyright (C) 2004 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
Barry Warsawbb113862004-10-03 03:16:19 +00003# Contact: email-sig@python.org
Anthony Baxter39a0f042004-03-22 00:33:28 +00004
"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
21
Anthony Baxter39a0f042004-03-22 00:33:28 +000022import re
Barry Warsaw418101f2004-05-09 03:29:23 +000023from email import Errors
24from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000025
# Any single line terminator: CRLF, bare CR, or bare LF.  Used with .match()
# to recognize a blank line (the header/body separator).
NLCRE = re.compile('\r\n|\r|\n')
# Same terminators, captured as a group; used with .match() to strip a
# terminator from the beginning of a line.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A terminator anchored at the end of the string; used with .search() to strip
# the trailing newline from a line, payload, preamble or epilogue.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Used with .split() to crack pushed data into lines.  The capturing group
# makes re.split() keep the terminators in the result list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# A line belongs to the header block if it starts with 'From ' (envelope
# header), with a field name of two or more word/hyphen characters followed
# by a colon, or with a space or tab (continuation line).
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned by BufferedSubFile.readline() (and yielded by the
# parser generator) when no complete line is available yet and the input has
# not been closed.  Always compare against it with `is`.
NeedMoreData = object()
35
Anthony Baxter39a0f042004-03-22 00:33:28 +000036
Barry Warsaw418101f2004-05-09 03:29:23 +000037
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial (not yet newline-terminated) line pushed into this
        # object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the *last* element, so reading is a cheap list.pop().
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which pred(line) is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the final line of the
        # input, so it has to go to the *front* of the reversed list; putting
        # it at the end would make it the next line read, ahead of anything
        # still queued.  An empty partial is not queued at all: readline()
        # already reports EOF once the buffer is closed and drained.
        if self._partial:
            self._lines.insert(0, self._partial)
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (real or false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one readline() returns."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers, then crack into lines, but preserve
        # the newlines on the end of each.
        parts = NLCRE_crack.split(self._partial + data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare '\r' we cannot tell yet whether a '\n'
        # will follow in the next chunk.  Hold that line back as the partial
        # so a CRLF split across two push() calls is not cracked into a line
        # plus a phantom blank line.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines so they are read after everything pending."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        # Iteration ends at a real or false EOF.  NeedMoreData is passed
        # through to the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
Anthony Baxter39a0f042004-03-22 00:33:28 +0000124
Barry Warsaw418101f2004-05-09 03:29:23 +0000125
126
class FeedParser:
    """A feed-style parser of email.

    Data is handed to the parser with feed(); close() finishes parsing and
    returns the root message object.  Problems found in the input are recorded
    on the affected message's .defects list.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # The parser proper is a generator; each call to self._parse() runs it
        # until it needs more input (or finishes).
        self._parse = self._parsegen().next
        # The message currently being parsed (top of the stack).
        self._cur = None
        # The most recently created message; used for newline cleansing after
        # a subpart has been parsed.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parser generator.  StopIteration just means the
        # generator has already run to completion.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        # Create a message, attach it to its parent (if any) and make it the
        # current message.
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop the finished message and make its parent (or None) current.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator implementing the parser.  It yields NeedMoreData whenever
        # the input buffer runs dry and is resumed by the next _call_parse().
        # It recurses (by iterating a nested _parsegen()) for subparts.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty when data is fed incrementally; in that case
                # wait for more input rather than treating the NeedMoreData
                # sentinel as a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # NOTE(review): the remaining lines are drained but never
                # appended, so the epilogue set below is always ''.  Kept
                # as-is to preserve existing behavior; confirm whether the
                # text after a stray end boundary should be retained.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header is stored on self._cur once its last continuation line
        # has been seen; malformed lines are recorded as defects instead.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip only the line terminator, exactly as the final flush
                # at the bottom of this method does.
                lhdr = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
Barry Warsaw8896bf52004-08-07 15:57:52 +0000464 self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')