blob: 2fa77d7afc8575120ed6bedf8c42ed8234450b0b [file] [log] [blame]
# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
"""
21
R David Murray1b6c7242012-03-16 22:43:05 -040022__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum8b3febe2007-08-30 01:15:14 +000023
24import re
25
26from email import errors
27from email import message
R David Murrayc27e5222012-05-25 15:01:48 -040028from email._policybase import compat32
Raymond Hettingerf070f1c2015-05-22 17:23:28 -070029from collections import deque
R David Murraydc1650c2016-09-07 17:44:34 -040030from io import StringIO
Guido van Rossum8b3febe2007-08-30 01:15:14 +000031
# Line-ending patterns.  Each accepts the three conventions \r\n, \r and \n,
# listing \r\n first so that a CRLF pair is matched as a single unit.
#
# NLCRE: non-capturing form; the parser uses NLCRE.match() to test whether a
# line begins with a line ending (i.e. is a blank separator line).
NLCRE = re.compile(r'\r\n|\r|\n')
# NLCRE_bol: capturing form; used with .match() to find a line ending at the
# beginning of a string (to strip a leading newline from the epilogue).
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# NLCRE_eol: capturing form anchored with \Z; used with .search() to find a
# line ending at the very end of a string so it can be sliced off.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# NLCRE_crack: same pattern as NLCRE_bol.  Not referenced by the code visible
# in this file.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# headerRE matches the start of a line that belongs to the header section:
# a unix-from line ("From "), a run of ftext characters (octal 041-071 and
# 073-176, possibly empty) followed by ":", or a leading tab/space that marks
# a continuation line.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() -- and yielded by the parser
# generator -- when no complete line is buffered and the input is not closed.
# It is always compared by identity ("is NeedMoreData").
NeedMoreData = object()
43
44
45
class BufferedSubFile(object):
    """A line buffer that is filled incrementally and read like a file.

    Data arrives through push() in arbitrarily sized chunks and is handed
    back one complete line at a time by readline() or by iteration.

    A stack of "false EOF" predicates can be maintained with
    push_eof_matcher() / pop_eof_matcher().  Whenever any predicate on the
    stack matches the next line, readline() reports EOF (the empty string)
    and leaves that line in the buffer.  This lets the parser treat the end
    of a nested message exactly like the end of the input.
    """

    def __init__(self):
        # Text stream holding the trailing, not yet terminated line.
        # See issue 22233 for why this is a text stream and not a list.
        # newline='' keeps \r, \n and \r\n untranslated when reading back.
        self._partial = StringIO(newline='')
        # Complete lines waiting to be read, oldest on the left.
        self._lines = deque()
        # Stack of false-EOF predicates; the innermost one is last.
        self._eofstack = []
        # Becomes True once close() has been called.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push *pred*, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any trailing partial line and mark the buffer as closed."""
        tail = self._partial
        # Whatever is still pending becomes regular line(s), terminated or
        # not, because no further data will arrive to complete it.
        tail.seek(0)
        self.pushlines(tail.readlines())
        tail.seek(0)
        tail.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and
        close() has not been called yet.
        """
        if self._lines:
            candidate = self._lines.popleft()
            # RFC 2046, section 5.1.2 requires us to recognize outer level
            # boundaries at any level of inner nesting.  The predicates are
            # consulted from most to least nested.
            if any(matches(candidate)
                   for matches in reversed(self._eofstack)):
                # False EOF: the line stays in the buffer for the outer
                # consumer to read.
                self._lines.appendleft(candidate)
                return ''
            return candidate
        return '' if self._closed else NeedMoreData

    def unreadline(self, line):
        """Put *line* back at the front of the buffer."""
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        pending = self._partial
        pending.write(data)
        if not ('\n' in data or '\r' in data):
            # The chunk cannot have completed a line; wait for more.
            return

        # Split everything pending into lines, keeping the line endings,
        # and start over with an empty partial buffer.
        pending.seek(0)
        pieces = pending.readlines()
        pending.seek(0)
        pending.truncate()

        # A final piece that does not end in '\n' is still incomplete.  Only
        # '\n' is checked because a trailing '\r' may be the first half of a
        # '\r\n' split across two chunks (see bugs 1555570 and 1721862).
        if not pieces[-1].endswith('\n'):
            pending.write(pieces.pop())
        self.pushlines(pieces)

    def pushlines(self, lines):
        """Append already split *lines* to the end of the buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at (false) EOF; NeedMoreData is passed through to
        # the caller as an ordinary item.
        item = self.readline()
        if item == '':
            raise StopIteration
        return item
134
135
136
class FeedParser:
    """A feed-style parser of email.

    Text is pushed in with feed() in chunks of any size; close() finishes
    parsing and returns the root message object.  Problems found in the input
    are reported through policy.handle_defect() rather than by a dedicated
    parsing exception.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser.

        _factory is the callable used to create each new message object.  It
        is called with a 'policy' keyword argument; if a trial call made that
        way raises TypeError it is assumed to be an old-style factory and is
        called with no arguments instead.  When _factory is None,
        message.Message is used for the compat32 policy and
        message.EmailMessage for any other policy.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation. The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            # What this should be:
            #self._factory = policy.default_message_factory
            # but, because we are post 3.4 feature freeze, fix with temp hack:
            if self.policy is compat32:
                self._factory = message.Message
            else:
                self._factory = message.EmailMessage
        else:
            self._factory = _factory
            try:
                # Probe call; the object it creates is discarded.
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Advancing the generator runs the parser until it needs more data.
        self._parse = self._parsegen().__next__
        # Message on top of the stack, and the most recently created message.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parser generator by one step."""
        try:
            self._parse()
        except StopIteration:
            # The generator finished; there is nothing left to parse.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart()):
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        It yields NeedMoreData whenever the input buffer has no complete line
        available, and finishes once the message (up to a real or false EOF)
        has been consumed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # NOTE: the remaining lines are consumed but not stored, so
                # the epilogue set here is always the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue. If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header *lines* on the current message.

        lines is the list of raw header-section lines collected by
        _parsegen().  Continuation lines are folded into the preceding
        header, a leading unix-from line is stored with set_unixfrom(), and
        each completed header is passed through policy.header_source_parse()
        and stored with set_raw().  Every defect is reported through
        policy.handle_defect(), so a policy with raise_on_defect set raises
        for all of them.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it.  Reported through the policy, like every
                    # other defect, so that raise_on_defect is honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message. Track the error (through the policy) but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray96fd54e2010-10-08 15:55:28 +0000533
R David Murrayc27e5222012-05-25 15:01:48 -0400534
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode the bytes *data* and push the resulting text to the parser.

        Decoding uses the ASCII codec with the 'surrogateescape' error
        handler, so non-ASCII bytes are carried through as surrogate code
        points instead of raising a decoding error.
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)