Blame - Lib/email/feedparser.py - platform/external/python/cpython3

blob: e754d89cb63b85b3a9064a14f5d88014a1864ef4 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2004-2006 Python Software Foundation
				2	# Authors: Baxter, Wouters and Warsaw
				3	# Contact: email-sig@python.org
				4
				5	"""FeedParser - An email feed parser.
				6
				7	The feed parser implements an interface for incrementally parsing an email
				8	message, line by line. This has advantages for certain applications, such as
				9	those reading email messages off a socket.
				10
				11	FeedParser.feed() is the primary interface for pushing new data into the
				12	parser. It returns when there's nothing more it can do with the available
				13	data. When you have no more data to push into the parser, call .close().
				14	This completes the parsing and returns the root message object.
				15
				16	The other advantage of this parser is that it will never throw a parsing
				17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
				18	the current message. Defects are just instances that live on the message
				19	object's .defects attribute.
				20	"""
				21
				22	__all__ = ['FeedParser']
				23
				24	import re
				25
				26	from email import errors
				27	from email import message
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	28	from email import policy
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	29
				30	NLCRE = re.compile('\r\n\|\r\|\n')
				31	NLCRE_bol = re.compile('(\r\n\|\r\|\n)')
R. David Murray	45e0e14	2010-06-16 02:19:40 +0000	[diff] [blame]	32	NLCRE_eol = re.compile('(\r\n\|\r\|\n)\Z')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	33	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
				34	# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
				35	# except controls, SP, and ":".
				36	headerRE = re.compile(r'^(From \|[\041-\071\073-\176]{1,}:\|[\t ])')
				37	EMPTYSTRING = ''
				38	NL = '\n'
				39
				40	NeedMoreData = object()
				41
				42
				43
				44	class BufferedSubFile(object):
				45	"""A file-ish object that can have new data loaded into it.
				46
				47	You can also push and pop line-matching predicates onto a stack. When the
				48	current predicate matches the current line, a false EOF response
				49	(i.e. empty string) is returned instead. This lets the parser adhere to a
				50	simple abstraction -- it parses until EOF closes the current message.
				51	"""
				52	def __init__(self):
				53	# The last partial line pushed into this object.
				54	self._partial = ''
				55	# The list of full, pushed lines, in reverse order
				56	self._lines = []
				57	# The stack of false-EOF checking predicates.
				58	self._eofstack = []
				59	# A flag indicating whether the file has been closed or not.
				60	self._closed = False
				61
				62	def push_eof_matcher(self, pred):
				63	self._eofstack.append(pred)
				64
				65	def pop_eof_matcher(self):
				66	return self._eofstack.pop()
				67
				68	def close(self):
				69	# Don't forget any trailing partial line.
				70	self._lines.append(self._partial)
				71	self._partial = ''
				72	self._closed = True
				73
				74	def readline(self):
				75	if not self._lines:
				76	if self._closed:
				77	return ''
				78	return NeedMoreData
				79	# Pop the line off the stack and see if it matches the current
				80	# false-EOF predicate.
				81	line = self._lines.pop()
				82	# RFC 2046, section 5.1.2 requires us to recognize outer level
				83	# boundaries at any level of inner nesting. Do this, but be sure it's
				84	# in the order of most to least nested.
				85	for ateof in self._eofstack[::-1]:
				86	if ateof(line):
				87	# We're at the false EOF. But push the last line back first.
				88	self._lines.append(line)
				89	return ''
				90	return line
				91
				92	def unreadline(self, line):
				93	# Let the consumer push a line back into the buffer.
				94	assert line is not NeedMoreData
				95	self._lines.append(line)
				96
				97	def push(self, data):
				98	"""Push some new data into this object."""
				99	# Handle any previous leftovers
				100	data, self._partial = self._partial + data, ''
				101	# Crack into lines, but preserve the newlines on the end of each
				102	parts = NLCRE_crack.split(data)
				103	# The ahem interesting behaviour of re.split when supplied grouping
				104	# parentheses is that the last element of the resulting list is the
				105	# data after the final RE. In the case of a NL/CR terminated string,
				106	# this is the empty string.
				107	self._partial = parts.pop()
R. David Murray	45bf773f	2010-07-17 01:19:57 +0000	[diff] [blame]	108	#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
				109	# is there a \n to follow later?
				110	if not self._partial and parts and parts[-1].endswith('\r'):
				111	self._partial = parts.pop(-2)+parts.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	112	# parts is a list of strings, alternating between the line contents
				113	# and the eol character(s). Gather up a list of lines after
				114	# re-attaching the newlines.
				115	lines = []
				116	for i in range(len(parts) // 2):
				117	lines.append(parts[i2] + parts[i2+1])
				118	self.pushlines(lines)
				119
				120	def pushlines(self, lines):
				121	# Reverse and insert at the front of the lines.
				122	self._lines[:0] = lines[::-1]
				123
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	124	def __iter__(self):
				125	return self
				126
				127	def __next__(self):
				128	line = self.readline()
				129	if line == '':
				130	raise StopIteration
				131	return line
				132
				133
				134
				135	class FeedParser:
				136	"""A feed-style parser of email."""
				137
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	138	def __init__(self, _factory=message.Message, *, policy=policy.default):
				139	"""_factory is called with no arguments to create a new message obj
				140
				141	The policy keyword specifies a policy object that controls a number of
				142	aspects of the parser's operation. The default policy maintains
				143	backward compatibility.
				144
				145	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	146	self._factory = _factory
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	147	self.policy = policy
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	148	self._input = BufferedSubFile()
				149	self._msgstack = []
				150	self._parse = self._parsegen().__next__
				151	self._cur = None
				152	self._last = None
				153	self._headersonly = False
				154
				155	# Non-public interface for supporting Parser's headersonly flag
				156	def _set_headersonly(self):
				157	self._headersonly = True
				158
				159	def feed(self, data):
				160	"""Push more data into the parser."""
				161	self._input.push(data)
				162	self._call_parse()
				163
				164	def _call_parse(self):
				165	try:
				166	self._parse()
				167	except StopIteration:
				168	pass
				169
				170	def close(self):
				171	"""Parse all remaining data and return the root message object."""
				172	self._input.close()
				173	self._call_parse()
				174	root = self._pop_message()
				175	assert not self._msgstack
				176	# Look for final set of defects
				177	if root.get_content_maintype() == 'multipart' \
				178	and not root.is_multipart():
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	179	defect = errors.MultipartInvariantViolationDefect()
				180	self.policy.handle_defect(root, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	181	return root
				182
				183	def _new_message(self):
				184	msg = self._factory()
				185	if self._cur and self._cur.get_content_type() == 'multipart/digest':
				186	msg.set_default_type('message/rfc822')
				187	if self._msgstack:
				188	self._msgstack[-1].attach(msg)
				189	self._msgstack.append(msg)
				190	self._cur = msg
				191	self._last = msg
				192
				193	def _pop_message(self):
				194	retval = self._msgstack.pop()
				195	if self._msgstack:
				196	self._cur = self._msgstack[-1]
				197	else:
				198	self._cur = None
				199	return retval
				200
				201	def _parsegen(self):
				202	# Create a new message and start by parsing headers.
				203	self._new_message()
				204	headers = []
				205	# Collect the headers, searching for a line that doesn't match the RFC
				206	# 2822 header or continuation pattern (including an empty line).
				207	for line in self._input:
				208	if line is NeedMoreData:
				209	yield NeedMoreData
				210	continue
				211	if not headerRE.match(line):
				212	# If we saw the RFC defined header/body separator
				213	# (i.e. newline), just throw it away. Otherwise the line is
				214	# part of the body so push it back.
				215	if not NLCRE.match(line):
				216	self._input.unreadline(line)
				217	break
				218	headers.append(line)
				219	# Done with the headers, so parse them and figure out what we're
				220	# supposed to see in the body of the message.
				221	self._parse_headers(headers)
				222	# Headers-only parsing is a backwards compatibility hack, which was
				223	# necessary in the older parser, which could throw errors. All
				224	# remaining lines in the input are thrown into the message body.
				225	if self._headersonly:
				226	lines = []
				227	while True:
				228	line = self._input.readline()
				229	if line is NeedMoreData:
				230	yield NeedMoreData
				231	continue
				232	if line == '':
				233	break
				234	lines.append(line)
				235	self._cur.set_payload(EMPTYSTRING.join(lines))
				236	return
				237	if self._cur.get_content_type() == 'message/delivery-status':
				238	# message/delivery-status contains blocks of headers separated by
				239	# a blank line. We'll represent each header block as a separate
				240	# nested message object, but the processing is a bit different
				241	# than standard message/* types because there is no body for the
				242	# nested messages. A blank line separates the subparts.
				243	while True:
				244	self._input.push_eof_matcher(NLCRE.match)
				245	for retval in self._parsegen():
				246	if retval is NeedMoreData:
				247	yield NeedMoreData
				248	continue
				249	break
				250	msg = self._pop_message()
				251	# We need to pop the EOF matcher in order to tell if we're at
				252	# the end of the current file, not the end of the last block
				253	# of message headers.
				254	self._input.pop_eof_matcher()
				255	# The input stream must be sitting at the newline or at the
				256	# EOF. We want to see if we're at the end of this subpart, so
				257	# first consume the blank line, then test the next line to see
				258	# if we're at this subpart's EOF.
				259	while True:
				260	line = self._input.readline()
				261	if line is NeedMoreData:
				262	yield NeedMoreData
				263	continue
				264	break
				265	while True:
				266	line = self._input.readline()
				267	if line is NeedMoreData:
				268	yield NeedMoreData
				269	continue
				270	break
				271	if line == '':
				272	break
				273	# Not at EOF so this is a line we're going to need.
				274	self._input.unreadline(line)
				275	return
				276	if self._cur.get_content_maintype() == 'message':
				277	# The message claims to be a message/* type, then what follows is
				278	# another RFC 2822 message.
				279	for retval in self._parsegen():
				280	if retval is NeedMoreData:
				281	yield NeedMoreData
				282	continue
				283	break
				284	self._pop_message()
				285	return
				286	if self._cur.get_content_maintype() == 'multipart':
				287	boundary = self._cur.get_boundary()
				288	if boundary is None:
				289	# The message /claims/ to be a multipart but it has not
				290	# defined a boundary. That's a problem which we'll handle by
				291	# reading everything until the EOF and marking the message as
				292	# defective.
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	293	defect = errors.NoBoundaryInMultipartDefect()
				294	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	295	lines = []
				296	for line in self._input:
				297	if line is NeedMoreData:
				298	yield NeedMoreData
				299	continue
				300	lines.append(line)
				301	self._cur.set_payload(EMPTYSTRING.join(lines))
				302	return
R David Murray	749073a	2011-06-22 13:47:53 -0400	[diff] [blame^]	303	# Make sure a valid content type was specified per RFC 2045:6.4.
				304	if (self._cur.get('content-transfer-encoding', '8bit').lower()
				305	not in ('7bit', '8bit', 'binary')):
				306	defect = errors.InvalidMultipartContentTransferEncodingDefect()
				307	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	308	# Create a line match predicate which matches the inter-part
				309	# boundary as well as the end-of-multipart boundary. Don't push
				310	# this onto the input stream until we've scanned past the
				311	# preamble.
				312	separator = '--' + boundary
				313	boundaryre = re.compile(
				314	'(?P<sep>' + re.escape(separator) +
				315	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)?$')
				316	capturing_preamble = True
				317	preamble = []
				318	linesep = False
				319	while True:
				320	line = self._input.readline()
				321	if line is NeedMoreData:
				322	yield NeedMoreData
				323	continue
				324	if line == '':
				325	break
				326	mo = boundaryre.match(line)
				327	if mo:
				328	# If we're looking at the end boundary, we're done with
				329	# this multipart. If there was a newline at the end of
				330	# the closing boundary, then we need to initialize the
				331	# epilogue with the empty string (see below).
				332	if mo.group('end'):
				333	linesep = mo.group('linesep')
				334	break
				335	# We saw an inter-part boundary. Were we in the preamble?
				336	if capturing_preamble:
				337	if preamble:
				338	# According to RFC 2046, the last newline belongs
				339	# to the boundary.
				340	lastline = preamble[-1]
				341	eolmo = NLCRE_eol.search(lastline)
				342	if eolmo:
				343	preamble[-1] = lastline[:-len(eolmo.group(0))]
				344	self._cur.preamble = EMPTYSTRING.join(preamble)
				345	capturing_preamble = False
				346	self._input.unreadline(line)
				347	continue
				348	# We saw a boundary separating two parts. Consume any
				349	# multiple boundary lines that may be following. Our
				350	# interpretation of RFC 2046 BNF grammar does not produce
				351	# body parts within such double boundaries.
				352	while True:
				353	line = self._input.readline()
				354	if line is NeedMoreData:
				355	yield NeedMoreData
				356	continue
				357	mo = boundaryre.match(line)
				358	if not mo:
				359	self._input.unreadline(line)
				360	break
				361	# Recurse to parse this subpart; the input stream points
				362	# at the subpart's first line.
				363	self._input.push_eof_matcher(boundaryre.match)
				364	for retval in self._parsegen():
				365	if retval is NeedMoreData:
				366	yield NeedMoreData
				367	continue
				368	break
				369	# Because of RFC 2046, the newline preceding the boundary
				370	# separator actually belongs to the boundary, not the
				371	# previous subpart's payload (or epilogue if the previous
				372	# part is a multipart).
				373	if self._last.get_content_maintype() == 'multipart':
				374	epilogue = self._last.epilogue
				375	if epilogue == '':
				376	self._last.epilogue = None
				377	elif epilogue is not None:
				378	mo = NLCRE_eol.search(epilogue)
				379	if mo:
				380	end = len(mo.group(0))
				381	self._last.epilogue = epilogue[:-end]
				382	else:
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	383	payload = self._last._payload
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	384	if isinstance(payload, str):
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	385	mo = NLCRE_eol.search(payload)
				386	if mo:
				387	payload = payload[:-len(mo.group(0))]
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	388	self._last._payload = payload
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	389	self._input.pop_eof_matcher()
				390	self._pop_message()
				391	# Set the multipart up for newline cleansing, which will
				392	# happen if we're in a nested multipart.
				393	self._last = self._cur
				394	else:
				395	# I think we must be in the preamble
				396	assert capturing_preamble
				397	preamble.append(line)
				398	# We've seen either the EOF or the end boundary. If we're still
				399	# capturing the preamble, we never saw the start boundary. Note
				400	# that as a defect and store the captured text as the payload.
				401	# Everything from here to the EOF is epilogue.
				402	if capturing_preamble:
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	403	defect = errors.StartBoundaryNotFoundDefect()
				404	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	405	self._cur.set_payload(EMPTYSTRING.join(preamble))
				406	epilogue = []
				407	for line in self._input:
				408	if line is NeedMoreData:
				409	yield NeedMoreData
				410	continue
				411	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				412	return
				413	# If the end boundary ended in a newline, we'll need to make sure
				414	# the epilogue isn't None
				415	if linesep:
				416	epilogue = ['']
				417	else:
				418	epilogue = []
				419	for line in self._input:
				420	if line is NeedMoreData:
				421	yield NeedMoreData
				422	continue
				423	epilogue.append(line)
				424	# Any CRLF at the front of the epilogue is not technically part of
				425	# the epilogue. Also, watch out for an empty string epilogue,
				426	# which means a single newline.
				427	if epilogue:
				428	firstline = epilogue[0]
				429	bolmo = NLCRE_bol.match(firstline)
				430	if bolmo:
				431	epilogue[0] = firstline[len(bolmo.group(0)):]
				432	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				433	return
				434	# Otherwise, it's some non-multipart type, so the entire rest of the
				435	# file contents becomes the payload.
				436	lines = []
				437	for line in self._input:
				438	if line is NeedMoreData:
				439	yield NeedMoreData
				440	continue
				441	lines.append(line)
				442	self._cur.set_payload(EMPTYSTRING.join(lines))
				443
				444	def _parse_headers(self, lines):
				445	# Passed a list of lines that make up the headers for the current msg
				446	lastheader = ''
				447	lastvalue = []
				448	for lineno, line in enumerate(lines):
				449	# Check for continuation
				450	if line[0] in ' \t':
				451	if not lastheader:
				452	# The first line of the headers was a continuation. This
				453	# is illegal, so let's note the defect, store the illegal
				454	# line, and ignore it for purposes of headers.
				455	defect = errors.FirstHeaderLineIsContinuationDefect(line)
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	456	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	457	continue
				458	lastvalue.append(line)
				459	continue
				460	if lastheader:
				461	# XXX reconsider the joining of folded lines
				462	lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
				463	self._cur[lastheader] = lhdr
				464	lastheader, lastvalue = '', []
				465	# Check for envelope header, i.e. unix-from
				466	if line.startswith('From '):
				467	if lineno == 0:
				468	# Strip off the trailing newline
				469	mo = NLCRE_eol.search(line)
				470	if mo:
				471	line = line[:-len(mo.group(0))]
				472	self._cur.set_unixfrom(line)
				473	continue
				474	elif lineno == len(lines) - 1:
				475	# Something looking like a unix-from at the end - it's
				476	# probably the first line of the body, so push back the
				477	# line and stop.
				478	self._input.unreadline(line)
				479	return
				480	else:
				481	# Weirdly placed unix-from line. Note this as a defect
				482	# and ignore it.
				483	defect = errors.MisplacedEnvelopeHeaderDefect(line)
				484	self._cur.defects.append(defect)
				485	continue
				486	# Split the line on the colon separating field name from value.
				487	i = line.find(':')
				488	if i < 0:
				489	defect = errors.MalformedHeaderDefect(line)
				490	self._cur.defects.append(defect)
				491	continue
				492	lastheader = line[:i]
				493	lastvalue = [line[i+1:].lstrip()]
				494	# Done with all the lines, so handle the last header.
				495	if lastheader:
				496	# XXX reconsider the joining of folded lines
				497	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
R. David Murray	96fd54e	2010-10-08 15:55:28 +0000	[diff] [blame]	498
				499
				500	class BytesFeedParser(FeedParser):
				501	"""Like FeedParser, but feed accepts bytes."""
				502
				503	def feed(self, data):
				504	super().feed(data.decode('ascii', 'surrogateescape'))