# Copyright (C) 2004 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never throw a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
"""
21
Anthony Baxter39a0f042004-03-22 00:33:28 +000022import re
Barry Warsaw418101f2004-05-09 03:29:23 +000023from email import Errors
24from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000025
# A single line ending: CRLF, bare CR, or bare LF.
NLCRE = re.compile('\r\n|\r|\n')
# Capturing form, applied with .match() to find a line ending at the start of
# a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Capturing form anchored at the end of the string.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Capturing form, applied with .split() so the line endings are kept in the
# resulting list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".  A field name is one or more ftext characters,
# so single-character header names must be accepted as well.  The other two
# alternatives match a Unix-From envelope line and a continuation line.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel handed out in place of a line when the buffered input has run dry
# but has not been closed yet.
NeedMoreData = object()
37
Anthony Baxter39a0f042004-03-22 00:33:28 +000038
Barry Warsaw418101f2004-05-09 03:29:23 +000039
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack. When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead. This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Text received after the last line ending seen so far.
        self._partial = ''
        # Complete lines waiting to be read, stored in reverse order: the
        # next line to hand out is the last element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it accepts is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as finished, keeping any unterminated last line.

        The leftover partial line is queued behind every complete line that
        is still pending, so lines are always read back in the order in which
        the data arrived. An empty leftover is not queued at all, because an
        empty string would be read back as a premature EOF.
        """
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is buffered and the object has
        not been closed. An empty string is returned when the object is
        closed and drained, or when one of the pushed predicates matches the
        next line; in the latter case the line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting. Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF. But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Put a line back so that it is the next one readline() sees."""
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Prepend whatever was left over from the previous push.
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each.
        parts = NLCRE_crack.split(data)
        # Because the pattern has a capturing group, the split result
        # alternates line text and line ending, and its final element is the
        # text after the last line ending (the empty string when the data
        # ends with a line ending). That tail is the new partial line.
        self._partial = parts.pop()
        # What remains has even length: glue each line text to its ending.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind everything that is already pending."""
        # The buffer is kept reversed, so new lines go in reversed at the
        # front of the list.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: stop at (false) EOF, otherwise return the line.

        NeedMoreData is passed through to the caller like an ordinary line.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
Anthony Baxter39a0f042004-03-22 00:33:28 +0000126
Barry Warsaw418101f2004-05-09 03:29:23 +0000127
128
Anthony Baxter39a0f042004-03-22 00:33:28 +0000129class FeedParser:
Barry Warsaw418101f2004-05-09 03:29:23 +0000130 """A feed-style parser of email."""
Anthony Baxter39a0f042004-03-22 00:33:28 +0000131
Barry Warsaw418101f2004-05-09 03:29:23 +0000132 def __init__(self, _factory=Message.Message):
133 """_factory is called with no arguments to create a new message obj"""
134 self._factory = _factory
135 self._input = BufferedSubFile()
136 self._msgstack = []
Anthony Baxter39a0f042004-03-22 00:33:28 +0000137 self._parse = self._parsegen().next
Barry Warsaw418101f2004-05-09 03:29:23 +0000138 self._cur = None
139 self._last = None
140 self._headersonly = False
Anthony Baxter39a0f042004-03-22 00:33:28 +0000141
Barry Warsaw418101f2004-05-09 03:29:23 +0000142 # Non-public interface for supporting Parser's headersonly flag
143 def _set_headersonly(self):
144 self._headersonly = True
Anthony Baxter39a0f042004-03-22 00:33:28 +0000145
146 def feed(self, data):
Barry Warsaw418101f2004-05-09 03:29:23 +0000147 """Push more data into the parser."""
Anthony Baxter39a0f042004-03-22 00:33:28 +0000148 self._input.push(data)
149 self._call_parse()
150
151 def _call_parse(self):
152 try:
153 self._parse()
154 except StopIteration:
155 pass
156
Barry Warsaw418101f2004-05-09 03:29:23 +0000157 def close(self):
158 """Parse all remaining data and return the root message object."""
159 self._input.close()
160 self._call_parse()
161 root = self._pop_message()
162 assert not self._msgstack
Barry Warsawbb113862004-10-03 03:16:19 +0000163 # Look for final set of defects
164 if root.get_content_maintype() == 'multipart' \
165 and not root.is_multipart():
166 root.defects.append(Errors.MultipartInvariantViolationDefect())
Barry Warsaw418101f2004-05-09 03:29:23 +0000167 return root
Anthony Baxter39a0f042004-03-22 00:33:28 +0000168
Barry Warsaw418101f2004-05-09 03:29:23 +0000169 def _new_message(self):
170 msg = self._factory()
171 if self._cur and self._cur.get_content_type() == 'multipart/digest':
172 msg.set_default_type('message/rfc822')
173 if self._msgstack:
174 self._msgstack[-1].attach(msg)
175 self._msgstack.append(msg)
176 self._cur = msg
Barry Warsaw418101f2004-05-09 03:29:23 +0000177 self._last = msg
178
179 def _pop_message(self):
180 retval = self._msgstack.pop()
181 if self._msgstack:
182 self._cur = self._msgstack[-1]
183 else:
184 self._cur = None
185 return retval
186
187 def _parsegen(self):
188 # Create a new message and start by parsing headers.
189 self._new_message()
190 headers = []
191 # Collect the headers, searching for a line that doesn't match the RFC
192 # 2822 header or continuation pattern (including an empty line).
193 for line in self._input:
194 if line is NeedMoreData:
195 yield NeedMoreData
196 continue
197 if not headerRE.match(line):
198 # If we saw the RFC defined header/body separator
199 # (i.e. newline), just throw it away. Otherwise the line is
200 # part of the body so push it back.
201 if not NLCRE.match(line):
202 self._input.unreadline(line)
203 break
204 headers.append(line)
205 # Done with the headers, so parse them and figure out what we're
206 # supposed to see in the body of the message.
207 self._parse_headers(headers)
208 # Headers-only parsing is a backwards compatibility hack, which was
209 # necessary in the older parser, which could throw errors. All
210 # remaining lines in the input are thrown into the message body.
211 if self._headersonly:
212 lines = []
213 while True:
214 line = self._input.readline()
215 if line is NeedMoreData:
216 yield NeedMoreData
217 continue
218 if line == '':
219 break
220 lines.append(line)
221 self._cur.set_payload(EMPTYSTRING.join(lines))
222 return
Barry Warsaw418101f2004-05-09 03:29:23 +0000223 if self._cur.get_content_type() == 'message/delivery-status':
224 # message/delivery-status contains blocks of headers separated by
225 # a blank line. We'll represent each header block as a separate
Barry Warsawd38f4482004-05-11 20:19:09 +0000226 # nested message object, but the processing is a bit different
227 # than standard message/* types because there is no body for the
228 # nested messages. A blank line separates the subparts.
Barry Warsaw418101f2004-05-09 03:29:23 +0000229 while True:
230 self._input.push_eof_matcher(NLCRE.match)
231 for retval in self._parsegen():
232 if retval is NeedMoreData:
233 yield NeedMoreData
234 continue
235 break
236 msg = self._pop_message()
237 # We need to pop the EOF matcher in order to tell if we're at
238 # the end of the current file, not the end of the last block
239 # of message headers.
240 self._input.pop_eof_matcher()
241 # The input stream must be sitting at the newline or at the
242 # EOF. We want to see if we're at the end of this subpart, so
243 # first consume the blank line, then test the next line to see
244 # if we're at this subpart's EOF.
245 line = self._input.readline()
246 line = self._input.readline()
247 if line == '':
248 break
249 # Not at EOF so this is a line we're going to need.
250 self._input.unreadline(line)
251 return
Barry Warsawd38f4482004-05-11 20:19:09 +0000252 if self._cur.get_content_maintype() == 'message':
253 # The message claims to be a message/* type, then what follows is
254 # another RFC 2822 message.
255 for retval in self._parsegen():
256 if retval is NeedMoreData:
257 yield NeedMoreData
258 continue
259 break
260 self._pop_message()
261 return
Barry Warsaw418101f2004-05-09 03:29:23 +0000262 if self._cur.get_content_maintype() == 'multipart':
263 boundary = self._cur.get_boundary()
264 if boundary is None:
265 # The message /claims/ to be a multipart but it has not
266 # defined a boundary. That's a problem which we'll handle by
267 # reading everything until the EOF and marking the message as
268 # defective.
Barry Warsawbb113862004-10-03 03:16:19 +0000269 self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
Barry Warsaw418101f2004-05-09 03:29:23 +0000270 lines = []
271 for line in self._input:
272 if line is NeedMoreData:
273 yield NeedMoreData
274 continue
275 lines.append(line)
276 self._cur.set_payload(EMPTYSTRING.join(lines))
277 return
278 # Create a line match predicate which matches the inter-part
279 # boundary as well as the end-of-multipart boundary. Don't push
280 # this onto the input stream until we've scanned past the
281 # preamble.
282 separator = '--' + boundary
283 boundaryre = re.compile(
284 '(?P<sep>' + re.escape(separator) +
Barry Warsaw2e8c1f12004-11-28 00:21:42 +0000285 r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
Barry Warsaw418101f2004-05-09 03:29:23 +0000286 capturing_preamble = True
287 preamble = []
288 linesep = False
289 while True:
290 line = self._input.readline()
291 if line is NeedMoreData:
292 yield NeedMoreData
293 continue
294 if line == '':
295 break
296 mo = boundaryre.match(line)
297 if mo:
298 # If we're looking at the end boundary, we're done with
299 # this multipart. If there was a newline at the end of
300 # the closing boundary, then we need to initialize the
301 # epilogue with the empty string (see below).
302 if mo.group('end'):
303 linesep = mo.group('linesep')
304 break
305 # We saw an inter-part boundary. Were we in the preamble?
306 if capturing_preamble:
307 if preamble:
308 # According to RFC 2046, the last newline belongs
309 # to the boundary.
310 lastline = preamble[-1]
311 eolmo = NLCRE_eol.search(lastline)
312 if eolmo:
313 preamble[-1] = lastline[:-len(eolmo.group(0))]
314 self._cur.preamble = EMPTYSTRING.join(preamble)
315 capturing_preamble = False
316 self._input.unreadline(line)
317 continue
Barry Warsaw486cb0a2004-05-11 22:23:59 +0000318 # We saw a boundary separating two parts. Consume any
319 # multiple boundary lines that may be following. Our
320 # interpretation of RFC 2046 BNF grammar does not produce
321 # body parts within such double boundaries.
322 while True:
323 line = self._input.readline()
Barry Warsawe4aeb7d2004-05-15 16:26:28 +0000324 if line is NeedMoreData:
325 yield NeedMoreData
326 continue
Barry Warsaw486cb0a2004-05-11 22:23:59 +0000327 mo = boundaryre.match(line)
328 if not mo:
329 self._input.unreadline(line)
330 break
331 # Recurse to parse this subpart; the input stream points
332 # at the subpart's first line.
Barry Warsaw418101f2004-05-09 03:29:23 +0000333 self._input.push_eof_matcher(boundaryre.match)
334 for retval in self._parsegen():
335 if retval is NeedMoreData:
336 yield NeedMoreData
337 continue
338 break
339 # Because of RFC 2046, the newline preceding the boundary
340 # separator actually belongs to the boundary, not the
341 # previous subpart's payload (or epilogue if the previous
342 # part is a multipart).
343 if self._last.get_content_maintype() == 'multipart':
344 epilogue = self._last.epilogue
345 if epilogue == '':
346 self._last.epilogue = None
347 elif epilogue is not None:
348 mo = NLCRE_eol.search(epilogue)
349 if mo:
350 end = len(mo.group(0))
351 self._last.epilogue = epilogue[:-end]
352 else:
353 payload = self._last.get_payload()
354 if isinstance(payload, basestring):
355 mo = NLCRE_eol.search(payload)
356 if mo:
357 payload = payload[:-len(mo.group(0))]
358 self._last.set_payload(payload)
359 self._input.pop_eof_matcher()
360 self._pop_message()
361 # Set the multipart up for newline cleansing, which will
362 # happen if we're in a nested multipart.
363 self._last = self._cur
364 else:
365 # I think we must be in the preamble
366 assert capturing_preamble
367 preamble.append(line)
368 # We've seen either the EOF or the end boundary. If we're still
369 # capturing the preamble, we never saw the start boundary. Note
370 # that as a defect and store the captured text as the payload.
Barry Warsawdee0cf12004-10-09 23:00:11 +0000371 # Everything from here to the EOF is epilogue.
Barry Warsaw418101f2004-05-09 03:29:23 +0000372 if capturing_preamble:
Barry Warsawbb113862004-10-03 03:16:19 +0000373 self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
Barry Warsaw418101f2004-05-09 03:29:23 +0000374 self._cur.set_payload(EMPTYSTRING.join(preamble))
Barry Warsawdee0cf12004-10-09 23:00:11 +0000375 epilogue = []
376 for line in self._input:
377 if line is NeedMoreData:
378 yield NeedMoreData
379 continue
380 self._cur.epilogue = EMPTYSTRING.join(epilogue)
Barry Warsaw418101f2004-05-09 03:29:23 +0000381 return
382 # If the end boundary ended in a newline, we'll need to make sure
383 # the epilogue isn't None
384 if linesep:
385 epilogue = ['']
386 else:
387 epilogue = []
388 for line in self._input:
389 if line is NeedMoreData:
390 yield NeedMoreData
391 continue
392 epilogue.append(line)
393 # Any CRLF at the front of the epilogue is not technically part of
394 # the epilogue. Also, watch out for an empty string epilogue,
395 # which means a single newline.
Barry Warsaw5b44cd62004-05-11 18:10:15 +0000396 if epilogue:
397 firstline = epilogue[0]
398 bolmo = NLCRE_bol.match(firstline)
399 if bolmo:
400 epilogue[0] = firstline[len(bolmo.group(0)):]
Barry Warsaw418101f2004-05-09 03:29:23 +0000401 self._cur.epilogue = EMPTYSTRING.join(epilogue)
402 return
403 # Otherwise, it's some non-multipart type, so the entire rest of the
404 # file contents becomes the payload.
405 lines = []
406 for line in self._input:
407 if line is NeedMoreData:
408 yield NeedMoreData
409 continue
410 lines.append(line)
411 self._cur.set_payload(EMPTYSTRING.join(lines))
412
413 def _parse_headers(self, lines):
414 # Passed a list of lines that make up the headers for the current msg
Anthony Baxter39a0f042004-03-22 00:33:28 +0000415 lastheader = ''
416 lastvalue = []
Barry Warsaw418101f2004-05-09 03:29:23 +0000417 for lineno, line in enumerate(lines):
Anthony Baxter39a0f042004-03-22 00:33:28 +0000418 # Check for continuation
419 if line[0] in ' \t':
420 if not lastheader:
Barry Warsaw418101f2004-05-09 03:29:23 +0000421 # The first line of the headers was a continuation. This
422 # is illegal, so let's note the defect, store the illegal
423 # line, and ignore it for purposes of headers.
Barry Warsawbb113862004-10-03 03:16:19 +0000424 defect = Errors.FirstHeaderLineIsContinuationDefect(line)
Barry Warsaw418101f2004-05-09 03:29:23 +0000425 self._cur.defects.append(defect)
426 continue
Anthony Baxter39a0f042004-03-22 00:33:28 +0000427 lastvalue.append(line)
428 continue
Anthony Baxter39a0f042004-03-22 00:33:28 +0000429 if lastheader:
430 # XXX reconsider the joining of folded lines
Barry Warsaw8896bf52004-08-07 15:57:52 +0000431 lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
432 self._cur[lastheader] = lhdr
Anthony Baxter39a0f042004-03-22 00:33:28 +0000433 lastheader, lastvalue = '', []
Barry Warsaw418101f2004-05-09 03:29:23 +0000434 # Check for envelope header, i.e. unix-from
Anthony Baxter39a0f042004-03-22 00:33:28 +0000435 if line.startswith('From '):
436 if lineno == 0:
Barry Warsawc29db262004-05-10 14:48:30 +0000437 # Strip off the trailing newline
438 mo = NLCRE_eol.search(line)
439 if mo:
440 line = line[:-len(mo.group(0))]
Anthony Baxter39a0f042004-03-22 00:33:28 +0000441 self._cur.set_unixfrom(line)
442 continue
Barry Warsaw418101f2004-05-09 03:29:23 +0000443 elif lineno == len(lines) - 1:
Anthony Baxter39a0f042004-03-22 00:33:28 +0000444 # Something looking like a unix-from at the end - it's
Barry Warsaw418101f2004-05-09 03:29:23 +0000445 # probably the first line of the body, so push back the
446 # line and stop.
Anthony Baxter39a0f042004-03-22 00:33:28 +0000447 self._input.unreadline(line)
448 return
449 else:
Barry Warsaw418101f2004-05-09 03:29:23 +0000450 # Weirdly placed unix-from line. Note this as a defect
451 # and ignore it.
Barry Warsawbb113862004-10-03 03:16:19 +0000452 defect = Errors.MisplacedEnvelopeHeaderDefect(line)
Barry Warsaw418101f2004-05-09 03:29:23 +0000453 self._cur.defects.append(defect)
Anthony Baxter39a0f042004-03-22 00:33:28 +0000454 continue
Barry Warsaw418101f2004-05-09 03:29:23 +0000455 # Split the line on the colon separating field name from value.
Anthony Baxter39a0f042004-03-22 00:33:28 +0000456 i = line.find(':')
457 if i < 0:
Barry Warsawbb113862004-10-03 03:16:19 +0000458 defect = Errors.MalformedHeaderDefect(line)
Barry Warsaw418101f2004-05-09 03:29:23 +0000459 self._cur.defects.append(defect)
460 continue
Anthony Baxter39a0f042004-03-22 00:33:28 +0000461 lastheader = line[:i]
462 lastvalue = [line[i+1:].lstrip()]
Barry Warsaw418101f2004-05-09 03:29:23 +0000463 # Done with all the lines, so handle the last header.
Anthony Baxter39a0f042004-03-22 00:33:28 +0000464 if lastheader:
465 # XXX reconsider the joining of folded lines
Barry Warsaw8896bf52004-08-07 15:57:52 +0000466 self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')