blob: d28170e9496c914b00ba7dab332626ad8b9d5a54 [file] [log] [blame]
# Copyright (C) 2004 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
20
Anthony Baxter39a0f042004-03-22 00:33:28 +000021import re
Barry Warsaw418101f2004-05-09 03:29:23 +000022from email import Errors
23from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000024
# Sentinel handed back by BufferedSubFile.readline() (and yielded by the
# parser generator) when no complete line is available yet and the input has
# not been closed.  It is compared by identity, never by value.
NeedMoreData = object()

# Small string constants used for joining.
EMPTYSTRING = ""
NL = "\n"

# Line-ending patterns.  All of them recognise CRLF, a bare CR and a bare LF,
# in that order of preference.
#   NLCRE       - ungrouped; used with .match() to test for a blank line.
#   NLCRE_bol   - grouped; used with .match() to find a leading line ending.
#   NLCRE_eol   - grouped and anchored with '$'; finds a trailing line ending.
#   NLCRE_crack - grouped; used with .split() so the separators are kept.
NLCRE = re.compile("\r\n|\r|\n")
NLCRE_bol = re.compile("(\r\n|\r|\n)")
NLCRE_eol = re.compile("(\r\n|\r|\n)$")
NLCRE_crack = re.compile("(\r\n|\r|\n)")

# A line belongs to the header block if it starts with a unix-from marker
# ("From "), a field name of two or more word/hyphen characters followed by
# a colon, or leading whitespace (a continuation line).
headerRE = re.compile(r"^(From |[-\w]{2,}:|[\t ])")
34
Anthony Baxter39a0f042004-03-22 00:33:28 +000035
Barry Warsaw418101f2004-05-09 03:29:23 +000036
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    Data is pushed in arbitrary chunks with push() and read back one complete
    line at a time with readline() (or by iterating).  Lines keep their
    original line ending.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial (not yet newline-terminated) data pushed in.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the last element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Make `pred` the active false-EOF predicate."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the active false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as finished, flushing any trailing partial line."""
        # The partial data is the newest input, so it must be read *after*
        # every line already buffered; pushlines() puts it at the far end of
        # the reversed stack.  An empty partial would only read as EOF, so
        # there is nothing to flush in that case.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line.

        Returns NeedMoreData when no complete line is buffered and the file
        is still open, and '' at the real EOF or when the active false-EOF
        predicate matches the next line (which is then left in the buffer).
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        if self._eofstack:
            matches = self._eofstack[-1]
            if matches(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Let the consumer push a line back; it is the next one read."""
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers.
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each.
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # If the chunk ends in a bare CR we cannot yet tell whether it is a
        # complete line ending or the first half of a CRLF whose LF arrives
        # with the next chunk.  Hold that last line back as partial data so a
        # split CRLF is not turned into a line plus a phantom blank line.
        if not self._partial and parts and parts[-1] == '\r':
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines so they are read after those already held."""
        # Reverse and insert at the front of the (reversed) line stack.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: stop on (real or false) EOF.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same iterator step under the newer protocol name.
    __next__ = next
Anthony Baxter39a0f042004-03-22 00:33:28 +0000120
Barry Warsaw418101f2004-05-09 03:29:23 +0000121
122
class FeedParser:
    """A feed-style parser of email.

    Push text in with feed() as it becomes available, then call close() to
    finish parsing and obtain the root message object.  Problems found in the
    input are recorded in each message's `defects` list.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Resuming the generator parses as far as the buffered data allows.
        self._parse = self._parsegen().next
        # The message whose headers/body are being parsed right now.
        self._cur = None
        # The most recently created (or re-selected) message; used to trim
        # the newline that belongs to a following multipart boundary.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parse generator; it being exhausted is not an error.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        # Create a message, attach it to its parent (if any) and make it the
        # current message.
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        # Finish the innermost message and make its parent current again.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for subparts).  It
        # yields NeedMoreData whenever the input runs dry and is resumed
        # after more data has been pushed or the input has been closed.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.  Either read may find the
                # buffer empty; wait for more data rather than treating the
                # NeedMoreData sentinel as if it were a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next pass, this
                        # time as the start of the first subpart.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Recurse to
                    # parse this subpart; the input stream points at the
                    # subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header (with its continuation lines) is stored on self._cur;
        # malformed lines are recorded in self._cur.defects instead.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Drop the header's own line ending, which may be one or two
                # characters long (LF, CR or CRLF).
                value = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = value
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()