blob: ac3769d4b252d206bc3c5eff4d3c566bb3c53d0c [file] [log] [blame]
Barry Warsaw418101f2004-05-09 03:29:23 +00001# Copyright (C) 2004 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
Anthony Baxter39a0f042004-03-22 00:33:28 +00003
Barry Warsaw418101f2004-05-09 03:29:23 +00004"""FeedParser - An email feed parser.
5
6The feed parser implements an interface for incrementally parsing an email
7message, line by line. This has advantages for certain applications, such as
8those reading email messages off a socket.
9
10FeedParser.feed() is the primary interface for pushing new data into the
11parser. It returns when there's nothing more it can do with the available
12data. When you have no more data to push into the parser, call .close().
13This completes the parsing and returns the root message object.
14
15The other advantage of this parser is that it will never throw a parsing
16exception. Instead, when it finds something unexpected, it adds a 'defect' to
17the current message. Defects are just instances that live on the message
18object's .defects attribute.
19"""
20
Anthony Baxter39a0f042004-03-22 00:33:28 +000021import re
Barry Warsaw418101f2004-05-09 03:29:23 +000022from email import Errors
23from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000024
# Line-ending patterns.  Each one recognizes CRLF, a bare CR, or a bare LF.
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# The capturing group makes re.split() keep the line endings in its result.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# Matches a line that belongs to the header section: a unix-from line, a
# "Name:" field start, or a continuation line starting with whitespace.
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parser
# generator) when no complete line is available yet and the input is not
# closed.
NeedMoreData = object()



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When
    one of the pushed predicates matches the current line, a false EOF
    response (i.e. empty string) is returned instead.  This lets the parser
    adhere to a simple abstraction -- it parses until EOF closes the current
    message.
    """

    def __init__(self):
        # The last partial (not yet newline-terminated) chunk pushed in.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # read is the last element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it matches is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, keeping any trailing partial line."""
        # The partial line is the *newest* data, so it has to be read after
        # every line that is still pending.  Because self._lines is stored in
        # reverse order, that means inserting it at the front; appending it
        # would make it (or an empty string, i.e. a bogus EOF) come out
        # before older unread lines.
        if self._partial:
            self._lines.insert(0, self._partial)
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is buffered and close() has not
        been called yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches any false-EOF
        # predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure
        # it's in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first
                # so that it is seen again once the predicate is popped.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Let the consumer push a line back; it is the next line read."""
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers.
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each.
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare CR we cannot tell yet whether it is a
        # complete line ending or the first half of a CRLF whose LF arrives
        # with the next push.  Hold that last line back as the partial so
        # that a CRLF split across two chunks is not cracked into two lines.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.  Floor division keeps the pair count an
        # integer regardless of the interpreter's division rules.
        lines = []
        for i in range(len(parts) // 2):
            lines.append(parts[i*2] + parts[i*2+1])
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines so they are read after all pending lines."""
        # Reverse and insert at the front of the (reversed) line list.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at a (false) EOF.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same method under the name used by the newer iterator protocol.
    __next__ = next
Anthony Baxter39a0f042004-03-22 00:33:28 +0000122
Barry Warsaw418101f2004-05-09 03:29:23 +0000123
124
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes the parse and returns the
    root message object.  Problems found in the input are recorded in the
    affected message's ``defects`` list.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages being built; the root message is at index 0.
        self._msgstack = []
        # Bound method that resumes the parsing generator.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of the stack).
        self._cur = None
        # The most recently created/finished message, used for trimming the
        # newline that belongs to a following boundary.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        """Create a message, attach it to its parent, and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        It yields NeedMoreData whenever the input buffer runs dry and
        finishes when the message has been completely parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the
        # RFC 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart,
                # so first consume the blank line, then test the next line to
                # see if we're at this subpart's EOF.  Both reads must wait
                # for more data rather than treating the NeedMoreData
                # sentinel as if it were a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the
                    # preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        # Wait for input here too; the sentinel is not a
                        # string and must never reach the regex or be pushed
                        # back into the line buffer.
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part
            # of the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines in `lines` on the current message.

        Continuation lines are joined to the header they follow.  A leading
        continuation line, a misplaced unix-from line, and a line without a
        colon are each recorded as a defect on the current message and
        otherwise ignored.
        """
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip the whole line ending, not just one character, so a
                # CRLF-terminated header does not keep a trailing '\r'.
                value = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = value
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()