# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

# Names exported by ``from email.feedparser import *``.
__all__ = ['FeedParser', 'BytesFeedParser']

import re

from collections import deque
from io import StringIO

from email import errors
from email import message
from email._policybase import compat32

# Line-ending patterns.  All of them recognise exactly the three RFC line
# endings: CRLF, bare CR and bare LF.  Raw strings are used so that regex
# escapes such as \Z are not treated as (invalid) string escapes.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded when the parser has run out of buffered
# input but the input has not been closed yet.
NeedMoreData = object()


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Text stream holding the last partial line pushed into this object.
        # newline='' makes readlines() split on '\r\n', '\r' and '\n' only
        # (and keep them), unlike str.splitlines() which also splits on
        # characters such as '\x0b', '\x0c', '\x85' or '\u2028'.
        self._partial = StringIO(newline='')
        # Complete lines waiting to be read; the left end is the next line.
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push *pred*, a callable taking a line, onto the false-EOF stack."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any trailing partial line and mark the input as closed."""
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Take the next line and see if it matches a false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Push *line* back so that it is the next line returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append complete *lines* (in order) to the end of the buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at a real or false EOF; NeedMoreData is passed
        # through to the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line



class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=None, *, policy=compat32):
        """Create a parser.

        _factory is a callable used to create each new message object.  It is
        called with a ``policy`` keyword argument; if a trial call made that
        way raises TypeError it is treated as an old-style factory and called
        with no arguments instead.  When _factory is None, message.Message is
        used for the compat32 policy and message.EmailMessage otherwise.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._factory_kwds = lambda: {'policy': self.policy}
        if _factory is None:
            # Ideally the policy itself would name its default message
            # factory; until it does, pick the class from the policy identity.
            if self.policy is compat32:
                self._factory = message.Message
            else:
                self._factory = message.EmailMessage
        else:
            self._factory = _factory
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call advances the parsing generator until it needs more data.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Run the parser until it asks for more data; a finished generator
        # (StopIteration) simply means there is nothing left to do.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to the message on top of the stack (if
        # any) and make it the current message.
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop and return the top of the message stack; the new top (or None)
        # becomes the current message.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator driving the parse of one message (recursively for
        # subparts).  It yields NeedMoreData whenever the input buffer has no
        # complete line available and has not been closed.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            # The header value may be a Header object rather than a str (for
            # example when it contains undecodable bytes), so stringify it
            # before lower-casing.
            if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Drain whatever is left of the input; none of it is kept,
                # so the epilogue ends up as the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Every defect found here is reported through policy.handle_defect so
        # that the policy decides whether it is recorded or raised.
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message.  Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Push more data into the parser.

        *data* is decoded as ASCII with the 'surrogateescape' error handler,
        so bytes outside the ASCII range never raise; they are carried
        through as lone surrogate code points.
        """
        super().feed(data.decode('ascii', 'surrogateescape'))