blob: 1d6e3dd1cf160bfd7b7ce6b4e4418b3f466ccf5c [file] [log] [blame]
# Copyright (C) 2004 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org
Anthony Baxter39a0f042004-03-22 00:33:28 +00004
"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
21
Anthony Baxter39a0f042004-03-22 00:33:28 +000022import re
Barry Warsaw418101f2004-05-09 03:29:23 +000023from email import Errors
24from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000025
# Line-ending patterns.  CRLF is listed first so that a '\r\n' pair is always
# treated as one line ending rather than as a CR followed by an LF.
NLCRE = re.compile(r'\r\n|\r|\n')
# Line ending at the beginning of a string (used with .match()).
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Line ending at the very end of a string.  This is anchored with \Z rather
# than $ because $ also matches just before a trailing newline; with $ the
# string '\r\n\n' would report the '\r\n' in front of the final '\n' instead
# of the final '\n' itself, and callers would strip one character too many.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Used with .split(); the group makes the line endings show up in the result.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# Matches anything that may legally appear in a header block: a Unix-From
# envelope line, a "name:" field, or a continuation line.  Per RFC 2822
# section 3.6.8 a field name is one or more printable ASCII characters
# (%d33-57 / %d59-126), i.e. anything except controls, space and colon.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel handed back by the input buffer when no complete line is available
# yet and the input has not been closed.  Always compare against it with "is".
NeedMoreData = object()
35
Anthony Baxter39a0f042004-03-22 00:33:28 +000036
Barry Warsaw418101f2004-05-09 03:29:23 +000037
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; it is called with each line read."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it has to be read *after* every line that is still buffered; that
        # means going through pushlines() rather than appending to the read
        # end of the reversed list.
        if self._partial:
            self.pushlines([self._partial])
        # Clear it so that a repeated close() cannot queue the text twice.
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line.

        Returns NeedMoreData when nothing is buffered and the object has not
        been closed, and '' at EOF.  '' is also returned (a "false EOF") when
        any predicate on the EOF stack matches the next line; in that case the
        line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one returned."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends with a bare '\r' we cannot yet tell whether it is
        # a complete line ending or the first half of a '\r\n' pair whose
        # '\n' arrives with the next chunk.  Hold the whole line back as the
        # partial line so the pair is never split into two line endings.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines to be read after everything already queued."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: like readline(), but '' ends the iteration.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
Anthony Baxter39a0f042004-03-22 00:33:28 +0000123
Barry Warsaw418101f2004-05-09 03:29:23 +0000124
125
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes the parse and returns the
    root message.  Problems found in the input are recorded on the affected
    message's .defects list rather than raised.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being built, outermost first.
        self._msgstack = []
        # Each call resumes the parsing generator until it needs more input.
        self._parse = self._parsegen().next
        # The message on top of the stack (None when the stack is empty).
        self._cur = None
        # The most recently created message; used for newline cleanup after
        # a multipart subpart has been parsed.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        It yields NeedMoreData whenever the input buffer has no complete line
        available, and finishes once the message it created has been fully
        parsed.  The new message is left on the message stack for the caller
        to pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty, in which case we must wait for more data
                # instead of treating the NeedMoreData sentinel as a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.  The line ending is optional so that a closing
            # boundary on the very last, unterminated line of the input is
            # still recognized.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # The remaining lines are consumed but not kept, so the
                # epilogue stored below is always the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block given by lines on the current message.

        lines is the list of raw header lines, each non-empty.  Malformed
        lines are recorded as defects on the current message and skipped.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Only the line ending of the final physical line is removed;
                # the line endings inside a folded value are kept.
                lhdr = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
Barry Warsaw8896bf52004-08-07 15:57:52 +0000463 self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')