blob: e754d89cb63b85b3a9064a14f5d88014a1864ef4 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2004-2006 Python Software Foundation
2# Authors: Baxter, Wouters and Warsaw
3# Contact: email-sig@python.org
4
5"""FeedParser - An email feed parser.
6
7The feed parser implements an interface for incrementally parsing an email
8message, line by line. This has advantages for certain applications, such as
9those reading email messages off a socket.
10
11FeedParser.feed() is the primary interface for pushing new data into the
12parser. It returns when there's nothing more it can do with the available
13data. When you have no more data to push into the parser, call .close().
14This completes the parsing and returns the root message object.
15
16The other advantage of this parser is that it will never throw a parsing
17exception. Instead, when it finds something unexpected, it adds a 'defect' to
18the current message. Defects are just instances that live on the message
19object's .defects attribute.
20"""
21
# Public names exported by ``from <this module> import *``.  Both parser
# classes defined in this module are public API, so both are listed.
__all__ = ['FeedParser', 'BytesFeedParser']
23
24import re
25
26from email import errors
27from email import message
R David Murray3edd22a2011-04-18 13:59:37 -040028from email import policy
Guido van Rossum8b3febe2007-08-30 01:15:14 +000029
# Line-ending patterns.  Raw strings are used so that regex escapes such as
# \Z reach the regex engine untouched instead of being (mis)interpreted as
# string-literal escapes.
# Matches a single line ending (CRLF, CR or LF).
NLCRE = re.compile(r'\r\n|\r|\n')
# Same line endings, captured as a group; used with .match() to detect a
# line ending at the beginning of a string.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# A line ending anchored at the very end of the string.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Used with .split(): the capturing group makes split() keep the separators.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
# A line is header-like if it starts with "From ", with a field name followed
# by a colon, or with whitespace (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel, compared by identity, meaning "no complete line is
# buffered yet".
NeedMoreData = object()
41
42
43
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is at the end of the list).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it accepts is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the end of input, flushing any trailing partial line.

        The partial line is the newest data, so it must be read after every
        complete line that is still buffered.  It is therefore queued through
        pushlines() rather than appended to the read end of the list.  An
        empty partial is not queued at all, since an empty string would be
        read back as a premature EOF.
        """
        if self._partial:
            self.pushlines([self._partial])
            self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (real or false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        object has not been closed yet.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that it is the next one returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending
        # with \r: is there a \n to follow later?  Hold the last line back as
        # the partial until more data (or close()) settles the question.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines to be read after those already buffered."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at EOF ('') but passes NeedMoreData through to the
        # consumer.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
132
133
134
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message, *, policy=policy.default):
        """_factory is called with no arguments to create a new message obj

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self._factory = _factory
        self.policy = policy
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Each call to self._parse() resumes the parsing generator until it
        # needs more input or finishes.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parsing generator.  StopIteration only means the
        # generator has run to completion, which is not an error here.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if (root.get_content_maintype() == 'multipart'
                and not root.is_multipart()):
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        # Create a message, attach it to its parent (if any) and make it the
        # current message.
        msg = self._factory()
        # Compare against None explicitly so the check does not depend on the
        # truthiness of the message object.
        if (self._cur is not None
                and self._cur.get_content_type() == 'multipart/digest'):
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop the innermost message and make its parent (if any) current.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        It yields NeedMoreData whenever the input buffer has no complete line
        available, and finishes when the (real or false) EOF of the message
        is reached.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    # Keep the remaining text instead of silently dropping
                    # it, so that it really ends up in the epilogue.
                    epilogue.append(line)
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines of the current message on self._cur.

        lines is the list of raw header lines (continuation lines included)
        collected by _parsegen().  Every defect found is reported through
        self.policy.handle_defect(), so the policy decides how it is handled.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                # Only the trailing line ending is removed; the same
                # stripping is applied to the final header below.
                lhdr = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self.policy.handle_defect(self._cur, defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
R. David Murray96fd54e2010-10-08 15:55:28 +0000498
499
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode a chunk of bytes and push the resulting text to the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler before being handed to FeedParser.feed().
        """
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)