blob: 3d74978cdbbbbc9e2caed23a33e0f209de8e68c9 [file] [log] [blame]
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
"""
21
R David Murray1b6c7242012-03-16 22:43:05 -040022__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum8b3febe2007-08-30 01:15:14 +000023
24import re
25
26from email import errors
R David Murrayc27e5222012-05-25 15:01:48 -040027from email._policybase import compat32
Raymond Hettingerf070f1c2015-05-22 17:23:28 -070028from collections import deque
R David Murraydc1650c2016-09-07 17:44:34 -040029from io import StringIO
Guido van Rossum8b3febe2007-08-30 01:15:14 +000030
# Line-ending patterns.  All of them recognise '\r\n', a bare '\r' and a
# bare '\n'.
# Ungrouped form, used with .match() to test whether a line starts with a
# line ending.
NLCRE = re.compile(r'\r\n|\r|\n')
# Same pattern with the line ending captured as group 1.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# A line ending anchored at the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Same pattern as NLCRE_bol under a second name.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')

# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# A line matches when it starts with 'From ', with a (possibly empty) field
# name followed by ':', or with a tab or space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')

EMPTYSTRING = ''
NL = '\n'

# Unique sentinel object; the code in this module compares against it with
# ``is``.
NeedMoreData = object()
42
43
44
class BufferedSubFile:
    """Line buffer that is filled incrementally and read like a file.

    Text is added with push(); complete lines come back out of readline()
    or by iterating over the object.  A stack of line predicates ("EOF
    matchers") may be installed: when any of them accepts the next line,
    readline() reports a false EOF (the empty string) and leaves that line
    in the buffer.  This lets a consumer simply "read until EOF" to reach
    the end of the part it is working on.
    """

    def __init__(self):
        # Complete lines waiting to be read, oldest first.
        self._lines = deque()
        # Text of the trailing, not-yet-complete line.  See issue 22233 for
        # why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # Stack of false-EOF predicates; the last one pushed is tried first.
        self._eofstack = []
        # Becomes True once close() has been called.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Install *pred* on top of the false-EOF predicate stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently installed predicate."""
        return self._eofstack.pop()

    def _take_partial_lines(self):
        # Split everything held in the partial buffer into lines (line
        # endings kept), leave the buffer empty, and return the lines.
        pending = self._partial
        pending.seek(0)
        pieces = pending.readlines()
        pending.seek(0)
        pending.truncate()
        return pieces

    def close(self):
        """Flush any trailing partial line and mark the buffer as closed."""
        self.pushlines(self._take_partial_lines())
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed yet.
        """
        if self._lines:
            candidate = self._lines.popleft()
            # RFC 2046, section 5.1.2 requires outer level boundaries to be
            # recognised at any level of inner nesting, so every predicate
            # is consulted, most nested first.
            if any(at_eof(candidate) for at_eof in reversed(self._eofstack)):
                # False EOF: put the line back so it can be read again
                # after the predicate has been popped.
                self._lines.appendleft(candidate)
                return ''
            return candidate
        return '' if self._closed else NeedMoreData

    def unreadline(self, line):
        """Put *line* back at the front of the buffer."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if not ('\n' in data or '\r' in data):
            # Nothing in this chunk can have completed a line.
            return

        # Crack the buffered text into lines, preserving the line endings.
        pieces = self._take_partial_lines()

        # Only a final '\n' proves the last piece is complete: a piece
        # ending in '\r' might be the first half of a '\r\n' pair split
        # across two pushes (see bugs 1555570 and 1721862), so it goes back
        # into the partial buffer.
        if not pieces[-1].endswith('\n'):
            self._partial.write(pieces.pop())
        self.pushlines(pieces)

    def pushlines(self, lines):
        """Append already-split *lines* to the end of the buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at (false) EOF; NeedMoreData is passed through to
        # the caller like any other item.
        item = self.readline()
        if item == '':
            raise StopIteration
        return item
133
134
135
class FeedParser:
    """A feed-style parser of email.

    Text is handed over in arbitrary chunks with feed(); close() finishes
    parsing and returns the root message object.  Problems found in the
    input are reported through ``policy.handle_defect``.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """_factory is called with no arguments to create a new message obj

        If _factory is not given, the policy's message_factory is used; when
        that is None, email.message.Message is used.  A supplied _factory is
        first tried with a ``policy`` keyword argument; if that raises
        TypeError it is treated as an old-style factory and always called
        without arguments.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation. The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                # Policies may leave message_factory unset.  Imported here
                # rather than at module level to avoid an import cycle
                # with email.message.
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages being built; the root message is at index 0.
        self._msgstack = []
        # Each call resumes the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parsing generator.  StopIteration only means that the
        # generator has already run to completion.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
                and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to the message on top of the stack
        # (if there is one) and make it the current message.
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop and return the top message; the one below it (or None)
        # becomes the current message.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for sub-messages).
        # It yields NeedMoreData whenever the input buffer has no complete
        # line available and is resumed by the next feed()/close().
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Whatever input remains is consumed and discarded here, so
                # the epilogue is always set to the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue. If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header (first line plus continuation lines) is handed to
        # policy.header_source_parse and stored with set_raw.  Every defect
        # is reported through policy.handle_defect so that the policy
        # decides whether it is recorded or raised.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i > 0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray96fd54e2010-10-08 15:55:28 +0000526
R David Murrayc27e5222012-05-25 15:01:48 -0400527
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* and push the resulting text into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler, so non-ASCII bytes do not raise a decoding error.
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)