blob: 0bb92712aee4c6f3163a7d13aaf15bbe7aca4577 [file] [log] [blame]
Barry Warsaw418101f2004-05-09 03:29:23 +00001# Copyright (C) 2004 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
Anthony Baxter39a0f042004-03-22 00:33:28 +00003
Barry Warsaw418101f2004-05-09 03:29:23 +00004"""FeedParser - An email feed parser.
5
6The feed parser implements an interface for incrementally parsing an email
7message, line by line. This has advantages for certain applications, such as
8those reading email messages off a socket.
9
10FeedParser.feed() is the primary interface for pushing new data into the
11parser. It returns when there's nothing more it can do with the available
12data. When you have no more data to push into the parser, call .close().
13This completes the parsing and returns the root message object.
14
The other advantage of this parser is that it will never throw a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
19"""
20
Anthony Baxter39a0f042004-03-22 00:33:28 +000021import re
Barry Warsaw418101f2004-05-09 03:29:23 +000022from email import Errors
23from email import Message
Anthony Baxter39a0f042004-03-22 00:33:28 +000024
# Any single line terminator: CRLF, bare CR or bare LF.
NLCRE = re.compile('\r\n|\r|\n')
# Same terminators as a capturing group; used with match() to detect a
# terminator at the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A captured terminator anchored at the end of the string; used with search()
# to find (and then strip) a trailing line ending.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Captured terminator used with split(), so that the terminators themselves
# are kept in the resulting list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# A line that still belongs to the header block: a unix-from line ("From "),
# a field name of two or more word/hyphen characters followed by a colon, or
# a continuation line starting with a space or tab.
headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parser
# generator) when the buffered input is exhausted but not yet closed.
NeedMoreData = object()
34
Anthony Baxter39a0f042004-03-22 00:33:28 +000035
Barry Warsaw418101f2004-05-09 03:29:23 +000036
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack. When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead. This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the last element of the list.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a false-EOF predicate; readline() calls it with each line."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the file as closed, queueing any trailing partial line."""
        # Don't forget any trailing partial line. It is the newest data, so
        # it has to be read *after* every line that is already buffered:
        # queue it through pushlines() instead of appending it to the read
        # end of the reversed list. An empty partial is not queued at all,
        # because an empty string in the buffer would read as a premature
        # EOF in front of any lines still waiting.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next buffered line.

        Returns NeedMoreData when the buffer is empty but the file is still
        open, and '' when the buffer is empty and the file is closed or when
        the line matches the innermost false-EOF predicate. In the false-EOF
        case the line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        if self._eofstack:
            matches = self._eofstack[-1]
            if matches(line):
                # We're at the false EOF. But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one readline() sees."""
        # Let the consumer push a line back into the buffer.
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE. In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # A chunk that ends in a bare '\r' may be the first half of a '\r\n'
        # pair whose '\n' arrives with the next push(). Hold that line back
        # as the partial so the pair is not cracked into two lines.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s). Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind everything already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: like readline(), but '' ends the iteration.

        NeedMoreData is passed through to the consumer unchanged.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same iterator step under the name used by the newer iterator protocol.
    __next__ = next
Anthony Baxter39a0f042004-03-22 00:33:28 +0000120
Barry Warsaw418101f2004-05-09 03:29:23 +0000121
122
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes the parse and returns the
    root message object. Problems found while parsing are recorded in the
    'defects' list of the message being parsed instead of being raised.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Bound "step" of the parsing generator; each call resumes the parse
        # until it needs more data or finishes.
        self._parse = self._parsegen().next
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._cur.defects = []
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for subparts.

        Yields NeedMoreData whenever the input buffer runs dry before the
        message is complete, and finishes once the message has been parsed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        # So now the input is sitting at the first body line. If the message
        # claims to be a message/rfc822 type, then what follows is another RFC
        # 2822 message.
        if self._cur.get_content_type() == 'message/rfc822':
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF. Either read may run out of
                # buffered data, so wait for more instead of treating the
                # NeedMoreData sentinel as a line.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipart())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble. The line separator is optional so that a closing
            # boundary on an unterminated last line is still recognized.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Recurse to
                    # parse this subpart; the input stream points at the
                    # subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            # Otherwise everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFound())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline. The list is empty when the input
            # ended without a (newline-terminated) closing boundary, so only
            # look at the first line when there is one.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines of the current message.

        lines is the list of raw header lines, each still carrying its line
        ending. Folded continuation lines are joined onto their header,
        a leading unix-from line is recorded via set_unixfrom(), and
        malformed or misplaced lines are recorded as defects.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuation(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Strip the whole line ending (which may be two characters)
                # and nothing else, exactly as for the last header below.
                self._cur[lastheader] = \
                    EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline; it is a line
                    # terminator, not part of the envelope header.
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeader(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeader(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')