# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
"""

# Names exported by a star-import of this module.
__all__ = [
    'FeedParser',
    'BytesFeedParser',
]

import re
from collections import deque
from io import StringIO

from email import errors
from email import message
from email._policybase import compat32

# Line-ending patterns.  Each recognizes CRLF, a bare CR or a bare LF.  They
# are written as raw strings so that the regex engine (not the Python string
# literal parser) interprets every backslash escape, including \Z.
# Matches a line ending (used with .match(), i.e. at the start of a string).
NLCRE = re.compile(r'\r\n|\r|\n')
# Same, but captures the line ending; used to strip a leading newline.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Captures a line ending that sits at the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Captures a line ending anywhere in the string.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# A line "looks like a header" if it starts with a unix-from marker, with a
# (possibly empty) field name followed by a colon, or with folding whitespace.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel (compared with ``is``) meaning "no complete line is
# available yet, but the input has not been closed".
NeedMoreData = object()


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.

    Only '\\n', '\\r' and '\\r\\n' are treated as line endings.
    """
    def __init__(self):
        # Text of the last partial line pushed into this object.  This is a
        # text stream opened with newline='' rather than a list of chunks cut
        # up with str.splitlines(): the stream recognizes only '\n', '\r' and
        # '\r\n' as line endings (and leaves them untranslated), whereas
        # str.splitlines() also splits on characters such as '\x0b', '\x0c',
        # '\x1c'-'\x1e', '\x85', '\u2028' and '\u2029', which would cut a
        # single logical line in two.
        self._partial = StringIO(newline='')
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push *pred*, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the input as finished."""
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        # Empty the partial buffer and leave the write position at 0.
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed.  '' is returned when the object is closed
        and empty, or when any pushed predicate matches the next line; in the
        latter case the line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Push *line* back so that it is the next line to be read."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append already-split complete *lines* to the buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at a real or false EOF.  NeedMoreData is passed
        # through to the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line


class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes parsing and returns the
    root message object.  Every defect found while parsing is reported through
    the policy's handle_defect() method.
    """

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser.

        _factory is the callable used to create each new message object.  It
        is called with a ``policy`` keyword argument; if a trial call made
        here raises TypeError it is treated as an old-style factory and is
        called with no arguments instead.  When _factory is None,
        message.Message is used for the compat32 policy and
        message.EmailMessage for any other policy.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            # What this should be:
            #self._factory = policy.default_message_factory
            # but, because we are post 3.4 feature freeze, fix with temp hack:
            if self.policy is compat32:
                self._factory = message.Message
            else:
                self._factory = message.EmailMessage
        else:
            self._factory = _factory
            try:
                # Probe call only; the resulting object is discarded.
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parsing generator; its exhaustion just means that
        # parsing has finished.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart()):
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to the message on top of the stack (if
        # any), push it and make it both the current and the last message.
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop and return the top message; the new top of the stack (or None)
        # becomes the current message.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for subparts).  It
        # yields NeedMoreData whenever the input has no complete line left and
        # has not been closed.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read this boundary line on the next iteration,
                        # now that we are no longer in the preamble.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # NOTE: any lines remaining up to the EOF are consumed here
                # but not stored, so the epilogue is always set to ''.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header (its first line plus any continuation lines) is handed
        # to the policy's header_source_parse() and stored with set_raw().
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Report it through the policy, like
                    # every other defect in this parser, so the policy
                    # decides how it is handled.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message.  Track the error (through the policy, consistently
            # with the other defects) but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode *data* and push the resulting text into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler before being passed to FeedParser.feed().
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)