blob: 7c07ca86457a2aae626ebcee0c4cb024b5d70ed6 [file] [log] [blame]
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""
21
R David Murray1b6c7242012-03-16 22:43:05 -040022__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum8b3febe2007-08-30 01:15:14 +000023
24import re
25
26from email import errors
R David Murrayc27e5222012-05-25 15:01:48 -040027from email._policybase import compat32
Raymond Hettingerf070f1c2015-05-22 17:23:28 -070028from collections import deque
R David Murraydc1650c2016-09-07 17:44:34 -040029from io import StringIO
Guido van Rossum8b3febe2007-08-30 01:15:14 +000030
# Line-ending patterns.  All three conventions (CRLF, bare CR, bare LF) are
# recognised; '\r\n' is listed first so that it is matched as one separator.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Same alternatives, anchored at the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# A line is treated as part of the header block when it starts with the
# envelope marker 'From ', with a (possibly empty) field name followed by
# ':', or with a tab/space (continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel: BufferedSubFile.readline() returns it, and the parser
# generator yields it, when no complete line is available yet.
NeedMoreData = object()
42
43
44
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Text stream of the last partial line pushed into this object.
        # See issue 22233 for why this is a text stream and not a list.
        # newline='' keeps the original line endings untranslated.
        self._partial = StringIO(newline='')
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push *pred*, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the object closed.

        After this, readline() returns '' rather than NeedMoreData once the
        buffered lines are exhausted.
        """
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next buffered line.

        Returns NeedMoreData when no complete line is buffered and the object
        is still open, and '' when it is closed and empty or when any pushed
        predicate matches the next line (false EOF).  In the false-EOF case
        the matching line stays at the head of the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Put *line* back at the head of the buffer."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append the complete lines in *lines* to the buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops on real or false EOF; NeedMoreData is passed
        # through to the consumer as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
133
134
135
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=None, *, policy=compat32):
        """_factory is called with no arguments to create a new message obj

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            # Probe the factory once to find out whether it accepts the
            # 'policy' keyword; the object created here is discarded.
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call advances the parsing generator to its next yield.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parsing generator, ignoring its exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for subparts.

        Yields NeedMoreData whenever the input buffer has no complete line
        available; it is resumed by the next feed() or close().
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # The rest of the input is consumed but not stored, so the
                # epilogue ends up as the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines in *lines* on the current message.

        Continuation lines are folded into the preceding header, a leading
        'From ' line is recorded as the unix-from, and malformed lines are
        reported as defects through self.policy.handle_defect().
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Reported via the policy, like every
                    # other defect, so raise_on_defect is honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray96fd54e2010-10-08 15:55:28 +0000530
R David Murrayc27e5222012-05-25 15:01:48 -0400531
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* and push it into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler, so non-ASCII bytes do not raise a decoding error here.
        """
        super().feed(data.decode('ascii', 'surrogateescape'))