blob: af0e177d561f2d5c3e936fa8797e0fa8c9336ed3 [file] [log] [blame]
Barry Warsaw418101f2004-05-09 03:29:23 +00001# Copyright (C) 2004 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
Anthony Baxter39a0f042004-03-22 00:33:28 +00003
Barry Warsaw418101f2004-05-09 03:29:23 +00004"""FeedParser - An email feed parser.
5
6The feed parser implements an interface for incrementally parsing an email
7message, line by line. This has advantages for certain applications, such as
8those reading email messages off a socket.
9
10FeedParser.feed() is the primary interface for pushing new data into the
11parser. It returns when there's nothing more it can do with the available
12data. When you have no more data to push into the parser, call .close().
13This completes the parsing and returns the root message object.
14
The other advantage of this parser is that it will never throw a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
19"""
20
Anthony Baxter39a0f042004-03-22 00:33:28 +000021import re
Barry Warsaw418101f2004-05-09 03:29:23 +000022from email import Errors
23from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000024
# Line-ending patterns.  Each one recognizes the three conventions the parser
# accepts: CRLF, a bare CR and a bare LF.
NLCRE = re.compile('\r\n|\r|\n')
# Used with .match(), i.e. a line ending at the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A line ending at the very end of a string.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Used with .split(); the group keeps the line endings in the result list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# Matches anything that may appear in a header block: a unix-from envelope
# line, a "name:" field start, or a continuation line (leading space/tab).
# RFC 2822 section 2.2 defines a field name as one or more printable US-ASCII
# characters (octal 041-176) other than the colon (octal 072), so one-letter
# names and names containing punctuation such as '.' must match as well.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]+:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel meaning "no complete line is available yet, feed more data".
NeedMoreData = object()
34
Anthony Baxter39a0f042004-03-22 00:33:28 +000035
Barry Warsaw418101f2004-05-09 03:29:23 +000036
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    Text is loaded in arbitrary chunks with push() and handed back one
    complete line at a time by readline(), or by iterating.  readline()
    returns the NeedMoreData sentinel when no line is buffered and the object
    has not been closed, and the empty string at end of input.

    You can also push and pop line-matching predicates onto a stack.  When a
    predicate on the stack matches the current line, a false EOF response
    (i.e. empty string) is returned instead and the line stays buffered.
    This lets the parser adhere to a simple abstraction -- it parses until
    EOF closes the current message.
    """

    def __init__(self):
        # The trailing piece of pushed data that is not a complete line yet.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the last element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push pred, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete; readline() will now reach a real EOF."""
        # Don't forget any trailing partial line.  It is the last piece of
        # the input, so queue it behind any complete lines that have not
        # been read yet instead of on top of them.
        if self._partial:
            self.pushlines([self._partial])
            # Clear it so that closing twice cannot queue the text twice.
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at a real or false EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in self._eofstack[::-1]:
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push line back so that it is the next one readline() looks at."""
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object.

        data is cracked into lines on CRLF, CR or LF, each line keeping its
        line ending.  Text after the last line ending is held back and
        prepended to the data of the next push().
        """
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each.
        # Because the pattern has grouping parentheses, re.split returns the
        # line contents alternating with the eol character(s), followed by
        # the data after the final eol (the empty string when data ends with
        # a line ending).  That last element is the new partial line.
        parts = NLCRE_crack.split(data)
        self._partial = parts.pop()
        # A chunk that ends in a bare CR may be the first half of a CRLF
        # whose LF arrives with the next push.  Hold that line back as the
        # partial so the pair is not split into a line plus a blank line.
        if not self._partial and parts and parts[-1].endswith('\r'):
            eol = parts.pop()
            self._partial = parts.pop() + eol
        # What is left has an even length; re-attach each eol to its line.
        # Stepping by two avoids dividing the length, which only produced an
        # integer under classic division.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue lines, given in reading order, behind the unread lines."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: like readline(), but '' ends the iteration.

        NeedMoreData is passed through to the consumer as an ordinary value.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
Anthony Baxter39a0f042004-03-22 00:33:28 +0000122
Barry Warsaw418101f2004-05-09 03:29:23 +0000123
124
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes the parse and returns
    the root message object.  Problems found in the input are recorded as
    defect instances on the affected message's .defects list.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of the messages being parsed; the root message is at index 0.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of the message stack).
        self._cur = None
        # The most recently created or completed message, whose trailing
        # newline may have to be stripped when a boundary follows it.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator by one step."""
        try:
            self._parse()
        except StopIteration:
            # The generator has run to completion; there is nothing left
            # to do, so this is deliberately ignored.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Compare against None explicitly: the check is about whether there
        # is a current message, not about that message's truth value.
        if (self._cur is not None and
                self._cur.get_content_type() == 'multipart/digest'):
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for its subparts.

        Yields NeedMoreData whenever the input buffer runs dry and finishes
        when the (possibly false) EOF of the message has been reached.  The
        new message is left on the message stack for the caller to pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read can find the
                # buffer empty, so wait for more data instead of treating the
                # NeedMoreData sentinel as if it were a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Look at the boundary again, this time as the start
                        # of the first subpart.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Set the headers of the current message from the given lines.

        lines is the list of raw header lines, line endings included, that
        make up the header block.  Continuation lines are joined to the
        header they follow, a leading unix-from line is stored with
        set_unixfrom(), and anything unexpected is recorded in the current
        message's .defects list instead of raising.
        """
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip the whole line ending, not just one character, so
                # that a CRLF does not leave a stray CR on the value.
                value = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = value
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            # Same stripping as for every other header, so a value does not
            # depend on whether its header happens to be the last one.
            value = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
            self._cur[lastheader] = value