Blame - Lib/email/feedparser.py - platform/external/python/cpython3

blob: 0706cae9bf9fe885e28a7d096b1fe623e79b448d [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2004-2006 Python Software Foundation
				2	# Authors: Baxter, Wouters and Warsaw
				3	# Contact: email-sig@python.org
				4
				5	"""FeedParser - An email feed parser.
				6
				7	The feed parser implements an interface for incrementally parsing an email
				8	message, line by line. This has advantages for certain applications, such as
				9	those reading email messages off a socket.
				10
				11	FeedParser.feed() is the primary interface for pushing new data into the
				12	parser. It returns when there's nothing more it can do with the available
				13	data. When you have no more data to push into the parser, call .close().
				14	This completes the parsing and returns the root message object.
				15
				16	The other advantage of this parser is that it will never throw a parsing
				17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
				18	the current message. Defects are just instances that live on the message
				19	object's .defects attribute.
				20	"""
				21
R David Murray	1b6c724	2012-03-16 22:43:05 -0400	[diff] [blame]	22	__all__ = ['FeedParser', 'BytesFeedParser']
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	23
				24	import re
				25
				26	from email import errors
				27	from email import message
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	28	from email._policybase import compat32
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	29
				30	NLCRE = re.compile('\r\n\|\r\|\n')
				31	NLCRE_bol = re.compile('(\r\n\|\r\|\n)')
R. David Murray	45e0e14	2010-06-16 02:19:40 +0000	[diff] [blame]	32	NLCRE_eol = re.compile('(\r\n\|\r\|\n)\Z')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	33	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
				34	# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
				35	# except controls, SP, and ":".
				36	headerRE = re.compile(r'^(From \|[\041-\071\073-\176]{1,}:\|[\t ])')
				37	EMPTYSTRING = ''
				38	NL = '\n'
				39
				40	NeedMoreData = object()
				41
				42
				43
				44	class BufferedSubFile(object):
				45	"""A file-ish object that can have new data loaded into it.
				46
				47	You can also push and pop line-matching predicates onto a stack. When the
				48	current predicate matches the current line, a false EOF response
				49	(i.e. empty string) is returned instead. This lets the parser adhere to a
				50	simple abstraction -- it parses until EOF closes the current message.
				51	"""
				52	def __init__(self):
				53	# The last partial line pushed into this object.
				54	self._partial = ''
				55	# The list of full, pushed lines, in reverse order
				56	self._lines = []
				57	# The stack of false-EOF checking predicates.
				58	self._eofstack = []
				59	# A flag indicating whether the file has been closed or not.
				60	self._closed = False
				61
				62	def push_eof_matcher(self, pred):
				63	self._eofstack.append(pred)
				64
				65	def pop_eof_matcher(self):
				66	return self._eofstack.pop()
				67
				68	def close(self):
				69	# Don't forget any trailing partial line.
				70	self._lines.append(self._partial)
				71	self._partial = ''
				72	self._closed = True
				73
				74	def readline(self):
				75	if not self._lines:
				76	if self._closed:
				77	return ''
				78	return NeedMoreData
				79	# Pop the line off the stack and see if it matches the current
				80	# false-EOF predicate.
				81	line = self._lines.pop()
				82	# RFC 2046, section 5.1.2 requires us to recognize outer level
				83	# boundaries at any level of inner nesting. Do this, but be sure it's
				84	# in the order of most to least nested.
				85	for ateof in self._eofstack[::-1]:
				86	if ateof(line):
				87	# We're at the false EOF. But push the last line back first.
				88	self._lines.append(line)
				89	return ''
				90	return line
				91
				92	def unreadline(self, line):
				93	# Let the consumer push a line back into the buffer.
				94	assert line is not NeedMoreData
				95	self._lines.append(line)
				96
				97	def push(self, data):
				98	"""Push some new data into this object."""
				99	# Handle any previous leftovers
				100	data, self._partial = self._partial + data, ''
				101	# Crack into lines, but preserve the newlines on the end of each
				102	parts = NLCRE_crack.split(data)
				103	# The ahem interesting behaviour of re.split when supplied grouping
				104	# parentheses is that the last element of the resulting list is the
				105	# data after the final RE. In the case of a NL/CR terminated string,
				106	# this is the empty string.
				107	self._partial = parts.pop()
R. David Murray	45bf773f	2010-07-17 01:19:57 +0000	[diff] [blame]	108	#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
				109	# is there a \n to follow later?
				110	if not self._partial and parts and parts[-1].endswith('\r'):
				111	self._partial = parts.pop(-2)+parts.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	112	# parts is a list of strings, alternating between the line contents
				113	# and the eol character(s). Gather up a list of lines after
				114	# re-attaching the newlines.
				115	lines = []
				116	for i in range(len(parts) // 2):
				117	lines.append(parts[i2] + parts[i2+1])
				118	self.pushlines(lines)
				119
				120	def pushlines(self, lines):
				121	# Reverse and insert at the front of the lines.
				122	self._lines[:0] = lines[::-1]
				123
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	124	def __iter__(self):
				125	return self
				126
				127	def __next__(self):
				128	line = self.readline()
				129	if line == '':
				130	raise StopIteration
				131	return line
				132
				133
				134
				135	class FeedParser:
				136	"""A feed-style parser of email."""
				137
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	138	def __init__(self, _factory=message.Message, *, policy=compat32):
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	139	"""_factory is called with no arguments to create a new message obj
				140
				141	The policy keyword specifies a policy object that controls a number of
				142	aspects of the parser's operation. The default policy maintains
				143	backward compatibility.
				144
				145	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	146	self._factory = _factory
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	147	self.policy = policy
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	148	try:
				149	_factory(policy=self.policy)
				150	self._factory_kwds = lambda: {'policy': self.policy}
				151	except TypeError:
				152	# Assume this is an old-style factory
				153	self._factory_kwds = lambda: {}
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	154	self._input = BufferedSubFile()
				155	self._msgstack = []
				156	self._parse = self._parsegen().__next__
				157	self._cur = None
				158	self._last = None
				159	self._headersonly = False
				160
				161	# Non-public interface for supporting Parser's headersonly flag
				162	def _set_headersonly(self):
				163	self._headersonly = True
				164
				165	def feed(self, data):
				166	"""Push more data into the parser."""
				167	self._input.push(data)
				168	self._call_parse()
				169
				170	def _call_parse(self):
				171	try:
				172	self._parse()
				173	except StopIteration:
				174	pass
				175
				176	def close(self):
				177	"""Parse all remaining data and return the root message object."""
				178	self._input.close()
				179	self._call_parse()
				180	root = self._pop_message()
				181	assert not self._msgstack
				182	# Look for final set of defects
				183	if root.get_content_maintype() == 'multipart' \
				184	and not root.is_multipart():
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	185	defect = errors.MultipartInvariantViolationDefect()
				186	self.policy.handle_defect(root, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	187	return root
				188
				189	def _new_message(self):
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	190	msg = self._factory(**self._factory_kwds())
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	191	if self._cur and self._cur.get_content_type() == 'multipart/digest':
				192	msg.set_default_type('message/rfc822')
				193	if self._msgstack:
				194	self._msgstack[-1].attach(msg)
				195	self._msgstack.append(msg)
				196	self._cur = msg
				197	self._last = msg
				198
				199	def _pop_message(self):
				200	retval = self._msgstack.pop()
				201	if self._msgstack:
				202	self._cur = self._msgstack[-1]
				203	else:
				204	self._cur = None
				205	return retval
				206
				207	def _parsegen(self):
				208	# Create a new message and start by parsing headers.
				209	self._new_message()
				210	headers = []
				211	# Collect the headers, searching for a line that doesn't match the RFC
				212	# 2822 header or continuation pattern (including an empty line).
				213	for line in self._input:
				214	if line is NeedMoreData:
				215	yield NeedMoreData
				216	continue
				217	if not headerRE.match(line):
				218	# If we saw the RFC defined header/body separator
				219	# (i.e. newline), just throw it away. Otherwise the line is
				220	# part of the body so push it back.
				221	if not NLCRE.match(line):
				222	self._input.unreadline(line)
				223	break
				224	headers.append(line)
				225	# Done with the headers, so parse them and figure out what we're
				226	# supposed to see in the body of the message.
				227	self._parse_headers(headers)
				228	# Headers-only parsing is a backwards compatibility hack, which was
				229	# necessary in the older parser, which could throw errors. All
				230	# remaining lines in the input are thrown into the message body.
				231	if self._headersonly:
				232	lines = []
				233	while True:
				234	line = self._input.readline()
				235	if line is NeedMoreData:
				236	yield NeedMoreData
				237	continue
				238	if line == '':
				239	break
				240	lines.append(line)
				241	self._cur.set_payload(EMPTYSTRING.join(lines))
				242	return
				243	if self._cur.get_content_type() == 'message/delivery-status':
				244	# message/delivery-status contains blocks of headers separated by
				245	# a blank line. We'll represent each header block as a separate
				246	# nested message object, but the processing is a bit different
				247	# than standard message/* types because there is no body for the
				248	# nested messages. A blank line separates the subparts.
				249	while True:
				250	self._input.push_eof_matcher(NLCRE.match)
				251	for retval in self._parsegen():
				252	if retval is NeedMoreData:
				253	yield NeedMoreData
				254	continue
				255	break
				256	msg = self._pop_message()
				257	# We need to pop the EOF matcher in order to tell if we're at
				258	# the end of the current file, not the end of the last block
				259	# of message headers.
				260	self._input.pop_eof_matcher()
				261	# The input stream must be sitting at the newline or at the
				262	# EOF. We want to see if we're at the end of this subpart, so
				263	# first consume the blank line, then test the next line to see
				264	# if we're at this subpart's EOF.
				265	while True:
				266	line = self._input.readline()
				267	if line is NeedMoreData:
				268	yield NeedMoreData
				269	continue
				270	break
				271	while True:
				272	line = self._input.readline()
				273	if line is NeedMoreData:
				274	yield NeedMoreData
				275	continue
				276	break
				277	if line == '':
				278	break
				279	# Not at EOF so this is a line we're going to need.
				280	self._input.unreadline(line)
				281	return
				282	if self._cur.get_content_maintype() == 'message':
				283	# The message claims to be a message/* type, then what follows is
				284	# another RFC 2822 message.
				285	for retval in self._parsegen():
				286	if retval is NeedMoreData:
				287	yield NeedMoreData
				288	continue
				289	break
				290	self._pop_message()
				291	return
				292	if self._cur.get_content_maintype() == 'multipart':
				293	boundary = self._cur.get_boundary()
				294	if boundary is None:
				295	# The message /claims/ to be a multipart but it has not
				296	# defined a boundary. That's a problem which we'll handle by
				297	# reading everything until the EOF and marking the message as
				298	# defective.
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	299	defect = errors.NoBoundaryInMultipartDefect()
				300	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	301	lines = []
				302	for line in self._input:
				303	if line is NeedMoreData:
				304	yield NeedMoreData
				305	continue
				306	lines.append(line)
				307	self._cur.set_payload(EMPTYSTRING.join(lines))
				308	return
R David Murray	749073a	2011-06-22 13:47:53 -0400	[diff] [blame]	309	# Make sure a valid content type was specified per RFC 2045:6.4.
				310	if (self._cur.get('content-transfer-encoding', '8bit').lower()
				311	not in ('7bit', '8bit', 'binary')):
				312	defect = errors.InvalidMultipartContentTransferEncodingDefect()
				313	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	314	# Create a line match predicate which matches the inter-part
				315	# boundary as well as the end-of-multipart boundary. Don't push
				316	# this onto the input stream until we've scanned past the
				317	# preamble.
				318	separator = '--' + boundary
				319	boundaryre = re.compile(
				320	'(?P<sep>' + re.escape(separator) +
				321	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)?$')
				322	capturing_preamble = True
				323	preamble = []
				324	linesep = False
				325	while True:
				326	line = self._input.readline()
				327	if line is NeedMoreData:
				328	yield NeedMoreData
				329	continue
				330	if line == '':
				331	break
				332	mo = boundaryre.match(line)
				333	if mo:
				334	# If we're looking at the end boundary, we're done with
				335	# this multipart. If there was a newline at the end of
				336	# the closing boundary, then we need to initialize the
				337	# epilogue with the empty string (see below).
				338	if mo.group('end'):
				339	linesep = mo.group('linesep')
				340	break
				341	# We saw an inter-part boundary. Were we in the preamble?
				342	if capturing_preamble:
				343	if preamble:
				344	# According to RFC 2046, the last newline belongs
				345	# to the boundary.
				346	lastline = preamble[-1]
				347	eolmo = NLCRE_eol.search(lastline)
				348	if eolmo:
				349	preamble[-1] = lastline[:-len(eolmo.group(0))]
				350	self._cur.preamble = EMPTYSTRING.join(preamble)
				351	capturing_preamble = False
				352	self._input.unreadline(line)
				353	continue
				354	# We saw a boundary separating two parts. Consume any
				355	# multiple boundary lines that may be following. Our
				356	# interpretation of RFC 2046 BNF grammar does not produce
				357	# body parts within such double boundaries.
				358	while True:
				359	line = self._input.readline()
				360	if line is NeedMoreData:
				361	yield NeedMoreData
				362	continue
				363	mo = boundaryre.match(line)
				364	if not mo:
				365	self._input.unreadline(line)
				366	break
				367	# Recurse to parse this subpart; the input stream points
				368	# at the subpart's first line.
				369	self._input.push_eof_matcher(boundaryre.match)
				370	for retval in self._parsegen():
				371	if retval is NeedMoreData:
				372	yield NeedMoreData
				373	continue
				374	break
				375	# Because of RFC 2046, the newline preceding the boundary
				376	# separator actually belongs to the boundary, not the
				377	# previous subpart's payload (or epilogue if the previous
				378	# part is a multipart).
				379	if self._last.get_content_maintype() == 'multipart':
				380	epilogue = self._last.epilogue
				381	if epilogue == '':
				382	self._last.epilogue = None
				383	elif epilogue is not None:
				384	mo = NLCRE_eol.search(epilogue)
				385	if mo:
				386	end = len(mo.group(0))
				387	self._last.epilogue = epilogue[:-end]
				388	else:
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	389	payload = self._last._payload
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	390	if isinstance(payload, str):
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	391	mo = NLCRE_eol.search(payload)
				392	if mo:
				393	payload = payload[:-len(mo.group(0))]
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	394	self._last._payload = payload
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	395	self._input.pop_eof_matcher()
				396	self._pop_message()
				397	# Set the multipart up for newline cleansing, which will
				398	# happen if we're in a nested multipart.
				399	self._last = self._cur
				400	else:
				401	# I think we must be in the preamble
				402	assert capturing_preamble
				403	preamble.append(line)
				404	# We've seen either the EOF or the end boundary. If we're still
				405	# capturing the preamble, we never saw the start boundary. Note
				406	# that as a defect and store the captured text as the payload.
				407	# Everything from here to the EOF is epilogue.
				408	if capturing_preamble:
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	409	defect = errors.StartBoundaryNotFoundDefect()
				410	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	411	self._cur.set_payload(EMPTYSTRING.join(preamble))
				412	epilogue = []
				413	for line in self._input:
				414	if line is NeedMoreData:
				415	yield NeedMoreData
				416	continue
				417	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				418	return
				419	# If the end boundary ended in a newline, we'll need to make sure
				420	# the epilogue isn't None
				421	if linesep:
				422	epilogue = ['']
				423	else:
				424	epilogue = []
				425	for line in self._input:
				426	if line is NeedMoreData:
				427	yield NeedMoreData
				428	continue
				429	epilogue.append(line)
				430	# Any CRLF at the front of the epilogue is not technically part of
				431	# the epilogue. Also, watch out for an empty string epilogue,
				432	# which means a single newline.
				433	if epilogue:
				434	firstline = epilogue[0]
				435	bolmo = NLCRE_bol.match(firstline)
				436	if bolmo:
				437	epilogue[0] = firstline[len(bolmo.group(0)):]
				438	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				439	return
				440	# Otherwise, it's some non-multipart type, so the entire rest of the
				441	# file contents becomes the payload.
				442	lines = []
				443	for line in self._input:
				444	if line is NeedMoreData:
				445	yield NeedMoreData
				446	continue
				447	lines.append(line)
				448	self._cur.set_payload(EMPTYSTRING.join(lines))
				449
				450	def _parse_headers(self, lines):
				451	# Passed a list of lines that make up the headers for the current msg
				452	lastheader = ''
				453	lastvalue = []
				454	for lineno, line in enumerate(lines):
				455	# Check for continuation
				456	if line[0] in ' \t':
				457	if not lastheader:
				458	# The first line of the headers was a continuation. This
				459	# is illegal, so let's note the defect, store the illegal
				460	# line, and ignore it for purposes of headers.
				461	defect = errors.FirstHeaderLineIsContinuationDefect(line)
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	462	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	463	continue
				464	lastvalue.append(line)
				465	continue
				466	if lastheader:
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	467	self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	468	lastheader, lastvalue = '', []
				469	# Check for envelope header, i.e. unix-from
				470	if line.startswith('From '):
				471	if lineno == 0:
				472	# Strip off the trailing newline
				473	mo = NLCRE_eol.search(line)
				474	if mo:
				475	line = line[:-len(mo.group(0))]
				476	self._cur.set_unixfrom(line)
				477	continue
				478	elif lineno == len(lines) - 1:
				479	# Something looking like a unix-from at the end - it's
				480	# probably the first line of the body, so push back the
				481	# line and stop.
				482	self._input.unreadline(line)
				483	return
				484	else:
				485	# Weirdly placed unix-from line. Note this as a defect
				486	# and ignore it.
				487	defect = errors.MisplacedEnvelopeHeaderDefect(line)
				488	self._cur.defects.append(defect)
				489	continue
				490	# Split the line on the colon separating field name from value.
				491	i = line.find(':')
				492	if i < 0:
				493	defect = errors.MalformedHeaderDefect(line)
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	494	# XXX: fixme (defect not going through policy)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	495	self._cur.defects.append(defect)
				496	continue
				497	lastheader = line[:i]
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	498	lastvalue = [line]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	499	# Done with all the lines, so handle the last header.
				500	if lastheader:
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	501	self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
R. David Murray	96fd54e	2010-10-08 15:55:28 +0000	[diff] [blame]	502
R David Murray	c27e522	2012-05-25 15:01:48 -0400	[diff] [blame^]	503
class BytesFeedParser(FeedParser):
    """A FeedParser variant whose feed() takes bytes instead of str."""

    def feed(self, data):
        # Decode as ASCII; the 'surrogateescape' handler keeps undecodable
        # bytes as surrogate code points instead of raising.
        text = data.decode('ascii', 'surrogateescape')
        super().feed(text)