blob: eb75fe35793a75a9b676b943c8c23947eab159d1 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line. This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser. It returns when there's nothing more it can do with the available
13data. When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
Andrew Svetlov737fb892012-12-18 21:14:22 +020016The other advantage of this parser is that it will never raise a parsing
Guido van Rossum8b3febe2007-08-30 01:15:14 +000017exception. Instead, when it finds something unexpected, it adds a 'defect' to
18the current message. Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
R David Murray1b6c7242012-03-16 22:43:05 -040022__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum8b3febe2007-08-30 01:15:14 +000023
24import re
25
26from email import errors
27from email import message
R David Murrayc27e5222012-05-25 15:01:48 -040028from email._policybase import compat32
Guido van Rossum8b3febe2007-08-30 01:15:14 +000029
# Line-ending patterns.  All three newline conventions (CRLF, CR, LF) are
# recognised; CRLF is listed first so it is matched as a single terminator.
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A raw string is required here: '\Z' is a regex anchor, not a valid string
# escape, and a non-raw literal triggers an invalid-escape warning.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, i.e.
# any character except controls, SP, and ":".  The pattern also accepts a
# unix-from line ("From ") and a continuation line (leading tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned/yielded when the buffered input is exhausted but the
# input has not been closed yet.
NeedMoreData = object()
41
42
43
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order: the next line to
        # be read sits at the END of the list.
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it accepts is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the input as complete, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It is the newest data, so
        # it must be read AFTER every line still buffered; pushlines() puts it
        # at the front of the reversed list.  An empty partial is skipped so
        # it cannot show up as a premature EOF ('') ahead of buffered lines.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when nothing is buffered and the object has
        not been closed.  When a pushed predicate matches the next line, the
        line is left in the buffer and '' is returned instead.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push *line* back so that it is the next line returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the linesep characters on the end of
        # each.
        parts = data.splitlines(True)
        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if parts and not parts[-1].endswith('\n'):
            self._partial = parts.pop()
        self.pushlines(parts)

    def pushlines(self, lines):
        """Queue *lines* (in reading order) behind the lines already held."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops only at a (false) EOF; NeedMoreData is passed
        # through to the caller as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
123
124
125
class FeedParser:
    """A feed-style parser of email.

    Text is pushed in with feed(); close() finishes parsing and returns the
    root message object.  Problems found while parsing are reported through
    the policy's handle_defect() method.
    """

    def __init__(self, _factory=message.Message, *, policy=compat32):
        """Create a parser that builds messages with _factory.

        _factory is called to create each new message object.  It is called
        with a ``policy`` keyword argument if it accepts one, and with no
        arguments otherwise.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        try:
            # Probe once to find out whether the factory takes a policy.
            _factory(policy=self.policy)
            self._factory_kwds = lambda: {'policy': self.policy}
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages being built; the root message is at the bottom.
        self._msgstack = []
        # Each call to self._parse() advances the parsing generator until it
        # needs more data or finishes.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Advance the parser; a finished generator is not an error.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to the message currently on top of the
        # stack (if any), and make it the current message.
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop the top message off the stack and return it; the message below
        # it (or None) becomes the current message.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (recursing for subparts).  It
        # yields NeedMoreData whenever the input buffer runs dry and ends when
        # the message's (possibly false) EOF is reached.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content transfer encoding was specified per
            # RFC 2045:6.4 (a multipart may only be 7bit, 8bit or binary).
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # Drain whatever input remains; those lines are discarded and
                # the epilogue is set to the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg.
        # Each header's source lines (first line plus continuations) are
        # handed to the policy's header_source_parse() and stored on the
        # current message with set_raw().
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Report it through the policy, like every
                    # other defect in this class, so that the policy decides
                    # whether it is recorded or raised.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')
            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray96fd54e2010-10-08 15:55:28 +0000501
R David Murrayc27e5222012-05-25 15:01:48 -0400502
class BytesFeedParser(FeedParser):
    """FeedParser variant whose feed() takes bytes instead of str."""

    def feed(self, data):
        """Decode *data* and push the resulting text into the parser.

        The bytes are decoded as ASCII with the ``surrogateescape`` error
        handler before being handed to FeedParser.feed().
        """
        text = data.decode('ascii', errors='surrogateescape')
        super().feed(text)