# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line. This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser. It returns when there's nothing more it can do with the available
data. When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception. Instead, when it finds something unexpected, it adds a 'defect' to
the current message. Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser']

import re

from email import errors
from email import message
28
# Line-ending patterns.  Each one recognizes CRLF, a bare CR, or a bare LF.
# NLCRE matches a line ending; the _bol/_eol/_crack variants capture it.
NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Raw string: '\Z' is not a valid Python string escape, so it must reach the
# regex engine untouched (the engine itself understands \r, \n and \Z).
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel handed back by the buffered input when no complete line is
# available yet and the input has not been closed.
NeedMoreData = object()
40
41
42
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack. When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead. This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read sits at the end of the list).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it accepts is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any trailing partial line and mark the input as closed."""
        # Don't forget any trailing partial line. It is the newest data, so
        # it has to be queued behind every line that is still unread; going
        # through pushlines() keeps that order. An empty partial is not
        # queued at all -- readline() already reports '' once closed.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is buffered and the object has
        not been closed. '' is returned either when the object is closed and
        drained, or when a pushed predicate matches the next line; in the
        latter case the line stays buffered.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting. Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF. But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so that the next readline() returns it."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE. In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending
        # with \r: is there a \n to follow later? Hold the whole line back
        # as the partial until more data (or close()) settles the question.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2) + parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s). Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines behind everything that is still unread."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops at a real or false EOF; NeedMoreData is passed
        # through to the consumer as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
131
132
133
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # The generator is only created here; it does not start running
        # until the first feed()/close() call.
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parse generator one step, ignoring its exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the innermost message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursing for subparts).

        Yields NeedMoreData whenever the input has no complete line
        available, and finishes when the (possibly false) EOF of the current
        message has been reached.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors. All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line. We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages. A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF. We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary. That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary. Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart. If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary. Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts. Consume any
                    # multiple boundary lines that may be following. Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary. If we're still
            # capturing the preamble, we never saw the start boundary. Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop drains the input without appending
                # to `epilogue`, so the epilogue set below is always ''.
                # Any lines after an end boundary seen here are discarded;
                # confirm whether that is intended before changing it.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue. Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines in `lines` on the current message.

        Continuation lines are joined onto the preceding header. Problems
        are recorded on the current message's defects list.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation. This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')

483
class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        """Decode `data` and push it into the parser.

        The bytes are decoded as ASCII with the 'surrogateescape' error
        handler, so non-ASCII bytes become lone surrogates rather than
        raising a decoding error.
        """
        super().feed(data.decode('ascii', 'surrogateescape'))