# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
"""
21
# Names exported by ``from <this module> import *``.
__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum8b3febe2007-08-30 01:15:14 +000023
24import re
25
26from email import errors
27from email import message
R David Murrayc27e5222012-05-25 15:01:48 -040028from email._policybase import compat32
Guido van Rossum8b3febe2007-08-30 01:15:14 +000029
# Newline patterns.  All of them recognise the three line-ending styles
# '\r\n', '\r' and '\n'.  They are written as raw strings so that regex-only
# escapes such as \Z are never treated as (invalid) string escapes.
# NLCRE: a bare line ending (no capture group).
NLCRE = re.compile(r'\r\n|\r|\n')
# NLCRE_bol: a captured line ending, used with .match() at the start of text.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# NLCRE_eol: a captured line ending anchored at the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# NLCRE_crack: a captured line ending, used with .split() so the separators
# are kept in the result.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8, optional fields: ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  A line is header-like if it
# starts with "From " (unix-from), a field name followed by a colon, or
# leading whitespace (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded when no complete line is available yet.
NeedMoreData = object()
41
42
43
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial (not yet newline-terminated) text pushed in.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read is the *last* element.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it matches is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as finished, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It must be queued *behind*
        # every line that is already buffered, so go through pushlines()
        # rather than appending to the read end of the reversed list.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push *line* back so that it is the next line returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # Bugs 1555570, 1721862: confusion at a chunk boundary ending with
        # \r -- is there a \n to follow later?  Hold the whole last line back
        # as the partial until more data (or close()) settles the question.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue *lines* (in reading order) behind all buffered lines."""
        # Reverse and insert at the front of the (reversed) line list.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at (false) EOF; NeedMoreData is passed through to
        # the consumer as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
132
133
134
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message, *, policy=compat32):
        """Create a parser that builds messages with _factory.

        _factory is called to create each new message object.  It is called
        with a ``policy`` keyword argument if it accepts one, otherwise with
        no arguments (old-style factory).

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        try:
            # Probe the factory once to find out whether it takes 'policy'.
            _factory(policy=self.policy)
            self._factory_kwds = lambda: {'policy': self.policy}
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call to self._parse() advances the parsing generator until it
        # needs more data (or finishes, raising StopIteration).
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parser; a finished generator is not an error.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to the enclosing message (if any) and
        # make it the current one.
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Finish the current message and make its parent (or None) current.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for subparts).  It
        # yields NeedMoreData whenever the input buffer runs dry and simply
        # finishes when the message is complete.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content transfer encoding was specified per
            # RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Drain whatever input remains; none of it is kept, so the
                # epilogue ends up as the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Report it through the policy, like
                    # every other defect, so that policy settings such as
                    # raise_on_defect are honoured here too.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')
            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray96fd54e2010-10-08 15:55:28 +0000510
R David Murrayc27e5222012-05-25 15:01:48 -0400511
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* and push it into the parser.

        *data* may be bytes, bytearray or any other object supporting the
        buffer protocol (e.g. a memoryview).  It is decoded as ASCII with the
        'surrogateescape' error handler, so non-ASCII bytes are carried
        through as lone surrogates instead of raising.
        """
        # str(obj, encoding, errors) decodes any buffer object, not only
        # those that happen to provide a .decode() method.
        super().feed(str(data, 'ascii', 'surrogateescape'))