blob: 0706cae9bf9fe885e28a7d096b1fe623e79b448d [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line. This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser. It returns when there's nothing more it can do with the available
13data. When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never throw a parsing
17exception. Instead, when it finds something unexpected, it adds a 'defect' to
18the current message. Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
R David Murray1b6c7242012-03-16 22:43:05 -040022__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum8b3febe2007-08-30 01:15:14 +000023
24import re
25
26from email import errors
27from email import message
R David Murrayc27e5222012-05-25 15:01:48 -040028from email._policybase import compat32
Guido van Rossum8b3febe2007-08-30 01:15:14 +000029
# Line-terminator patterns.  All three conventions (CRLF, bare CR, bare LF)
# are recognized; CRLF is listed first so that it is consumed as one unit.
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Raw string on purpose: '\Z' is a regular-expression anchor, not a string
# escape, and writing it in a non-raw literal triggers an invalid-escape
# warning.  The re module itself interprets \r and \n inside the pattern.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, any
# character except controls, SP, and ":".  The pattern also accepts a
# unix-from line ("From ") and a continuation line (leading tab or space).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel handed back by the buffered input (and re-yielded by the parser
# generator) when no complete line is available yet.
NeedMoreData = object()
41
42
43
44class BufferedSubFile(object):
45 """A file-ish object that can have new data loaded into it.
46
47 You can also push and pop line-matching predicates onto a stack. When the
48 current predicate matches the current line, a false EOF response
49 (i.e. empty string) is returned instead. This lets the parser adhere to a
50 simple abstraction -- it parses until EOF closes the current message.
51 """
52 def __init__(self):
53 # The last partial line pushed into this object.
54 self._partial = ''
55 # The list of full, pushed lines, in reverse order
56 self._lines = []
57 # The stack of false-EOF checking predicates.
58 self._eofstack = []
59 # A flag indicating whether the file has been closed or not.
60 self._closed = False
61
62 def push_eof_matcher(self, pred):
63 self._eofstack.append(pred)
64
65 def pop_eof_matcher(self):
66 return self._eofstack.pop()
67
68 def close(self):
69 # Don't forget any trailing partial line.
70 self._lines.append(self._partial)
71 self._partial = ''
72 self._closed = True
73
74 def readline(self):
75 if not self._lines:
76 if self._closed:
77 return ''
78 return NeedMoreData
79 # Pop the line off the stack and see if it matches the current
80 # false-EOF predicate.
81 line = self._lines.pop()
82 # RFC 2046, section 5.1.2 requires us to recognize outer level
83 # boundaries at any level of inner nesting. Do this, but be sure it's
84 # in the order of most to least nested.
85 for ateof in self._eofstack[::-1]:
86 if ateof(line):
87 # We're at the false EOF. But push the last line back first.
88 self._lines.append(line)
89 return ''
90 return line
91
92 def unreadline(self, line):
93 # Let the consumer push a line back into the buffer.
94 assert line is not NeedMoreData
95 self._lines.append(line)
96
97 def push(self, data):
98 """Push some new data into this object."""
99 # Handle any previous leftovers
100 data, self._partial = self._partial + data, ''
101 # Crack into lines, but preserve the newlines on the end of each
102 parts = NLCRE_crack.split(data)
103 # The *ahem* interesting behaviour of re.split when supplied grouping
104 # parentheses is that the last element of the resulting list is the
105 # data after the final RE. In the case of a NL/CR terminated string,
106 # this is the empty string.
107 self._partial = parts.pop()
R. David Murray45bf773f2010-07-17 01:19:57 +0000108 #GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
109 # is there a \n to follow later?
110 if not self._partial and parts and parts[-1].endswith('\r'):
111 self._partial = parts.pop(-2)+parts.pop()
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000112 # parts is a list of strings, alternating between the line contents
113 # and the eol character(s). Gather up a list of lines after
114 # re-attaching the newlines.
115 lines = []
116 for i in range(len(parts) // 2):
117 lines.append(parts[i*2] + parts[i*2+1])
118 self.pushlines(lines)
119
120 def pushlines(self, lines):
121 # Reverse and insert at the front of the lines.
122 self._lines[:0] = lines[::-1]
123
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000124 def __iter__(self):
125 return self
126
127 def __next__(self):
128 line = self.readline()
129 if line == '':
130 raise StopIteration
131 return line
132
133
134
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message, *, policy=compat32):
        """Create a parser that builds messages with *_factory*.

        _factory is probed once with a ``policy`` keyword argument.  If that
        call raises TypeError it is assumed to be an old-style factory and is
        subsequently called with no arguments; otherwise every new message is
        created with ``policy=self.policy``.

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        try:
            _factory(policy=self.policy)
            self._factory_kwds = lambda: {'policy': self.policy}
        except TypeError:
            # Assume this is an old-style factory
            self._factory_kwds = lambda: {}
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed, outermost first.
        self._msgstack = []
        # The parser is a generator; each call to _parse() resumes it until
        # it needs more input or finishes.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory(**self._factory_kwds())
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursively for subparts).

        Yields NeedMoreData whenever the buffered input has no complete line
        available, and finishes when the (possibly false) EOF for the current
        message has been reached.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                # The remaining lines are consumed but not collected, so the
                # epilogue is set to the empty string.
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block *lines* on the current message.

        *lines* is the list of raw header lines (terminators included).
        Continuation lines are folded into the preceding header, a leading
        unix-from line is recorded via set_unixfrom(), and every irregularity
        is reported through self.policy.handle_defect() so that the policy
        decides whether the defect is recorded or raised.
        """
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # (through the policy, like every other defect) and
                    # ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self.policy.handle_defect(self._cur, defect)
                continue
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray96fd54e2010-10-08 15:55:28 +0000502
R David Murrayc27e5222012-05-25 15:01:48 -0400503
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode the bytes in *data* and push the text into the parser.

        The data is decoded as ASCII with the 'surrogateescape' error
        handler before being handed to FeedParser.feed().
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)