blob: 294a6a5a789f83574c3f2987d9fba3b12f690456 [file] [log] [blame]
# Copyright (C) 2004 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
Anthony Baxter39a0f042004-03-22 00:33:28 +00003
"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never throw a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
20
Anthony Baxter39a0f042004-03-22 00:33:28 +000021import re
Barry Warsaw418101f2004-05-09 03:29:23 +000022from email import Errors
23from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000024
# Line-ending patterns.  All three newline conventions (CRLF, bare CR and
# bare LF) are accepted everywhere.
#
# Matches a line ending at the current position (used with .match() to test
# whether a line *is* a blank line).
NLCRE = re.compile('\r\n|\r|\n')
# Captures a line ending at the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Captures a line ending at the very end of a string.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Used with .split(); the capturing group makes the line endings show up in
# the result so they can be re-attached to their lines.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# A line belongs to the header block if it is a Unix-From envelope line, a
# "name:" field, or a continuation line starting with whitespace.  RFC 2822
# section 2.2 defines a field name as one or more printable US-ASCII
# characters (33-126) other than the colon, so single-character names and
# names containing characters such as '.' or '_' must be accepted too.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded when the buffered input has run dry but
# has not been closed yet.  Always compare with "is".
NeedMoreData = object()
34
Anthony Baxter39a0f042004-03-22 00:33:28 +000035
Barry Warsaw418101f2004-05-09 03:29:23 +000036
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.

    readline() returns the NeedMoreData sentinel when no complete line is
    buffered and the object has not been closed yet.
    """
    def __init__(self):
        # Trailing text from push() that does not form a complete line yet.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the *last* element of the list.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Make pred (a callable taking a line) the active false-EOF test."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the active false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it must be read after every line that is already queued; since
        # readline() pops from the end of the list, that means index 0.  An
        # empty partial is not queued at all, otherwise it would look like
        # an EOF to the reader.
        if self._partial:
            self._lines.insert(0, self._partial)
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData."""
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        if self._eofstack:
            matches = self._eofstack[-1]
            if matches(line):
                # We're at the false EOF.  But push the last line back first
                # so it is seen again once the predicate has been popped.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push line back so the next readline() returns it again."""
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the data ends in a bare CR we cannot tell yet whether this is a
        # complete line or the first half of a CRLF whose LF arrives with the
        # next push.  Hold the whole line back as the partial until we know.
        if not self._partial and parts and parts[-1] == '\r':
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1]
                 for i in range(0, len(parts) - 1, 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines (given in reading order) behind older ones."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator protocol: stop at (false) EOF.

        Note that NeedMoreData is passed through as an ordinary item, so
        callers iterating over this object must check for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
Anthony Baxter39a0f042004-03-22 00:33:28 +0000120
Barry Warsaw418101f2004-05-09 03:29:23 +0000121
122
class FeedParser:
    """A feed-style parser of email.

    Data is handed over with feed(); close() finishes parsing and returns the
    root message object.  Problems in the input are recorded in the
    'defects' list of the message being parsed instead of being raised.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages under construction; the root is at index 0.
        self._msgstack = []
        # Each call resumes the parsing generator until it needs more data.
        self._parse = self._parsegen().next
        # The message currently being parsed (top of the stack).
        self._cur = None
        # The most recently created message; used for newline cleanup at
        # multipart boundaries.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator; a finished generator is not an error."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        It yields NeedMoreData whenever the input buffer runs dry and
        finishes when the (possibly false) EOF of the message is reached.
        Every read from the input must therefore be checked for the
        NeedMoreData sentinel before the line is used.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty, in which case we have to wait for more data
                # rather than treat the sentinel as a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.  The buffer
                    # may be empty right after a boundary line, so wait for
                    # more data instead of matching against the sentinel.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block 'lines' on the current message.

        lines is the list of raw header lines (line endings included) that
        _parsegen() collected.  Malformed input is recorded as a defect on
        the current message rather than raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Drop the line ending of the last physical line.  It may be
                # two characters long (CRLF), so slicing off a single
                # character is not enough.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()