Blame - Lib/email/feedparser.py - platform/external/python/cpython3

blob: 60de49e97f2a6550a0de27dba2ee844560fa6484 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2004-2006 Python Software Foundation
				2	# Authors: Baxter, Wouters and Warsaw
				3	# Contact: email-sig@python.org
				4
				5	"""FeedParser - An email feed parser.
				6
				7	The feed parser implements an interface for incrementally parsing an email
				8	message, line by line. This has advantages for certain applications, such as
				9	those reading email messages off a socket.
				10
				11	FeedParser.feed() is the primary interface for pushing new data into the
				12	parser. It returns when there's nothing more it can do with the available
				13	data. When you have no more data to push into the parser, call .close().
				14	This completes the parsing and returns the root message object.
				15
				16	The other advantage of this parser is that it will never throw a parsing
				17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
				18	the current message. Defects are just instances that live on the message
				19	object's .defects attribute.
				20	"""
				21
				22	__all__ = ['FeedParser']
				23
				24	import re
				25
				26	from email import errors
				27	from email import message
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	28	from email import policy
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	29
				30	NLCRE = re.compile('\r\n\|\r\|\n')
				31	NLCRE_bol = re.compile('(\r\n\|\r\|\n)')
R. David Murray	45e0e14	2010-06-16 02:19:40 +0000	[diff] [blame]	32	NLCRE_eol = re.compile('(\r\n\|\r\|\n)\Z')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	33	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
				34	# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
				35	# except controls, SP, and ":".
				36	headerRE = re.compile(r'^(From \|[\041-\071\073-\176]{1,}:\|[\t ])')
				37	EMPTYSTRING = ''
				38	NL = '\n'
				39
				40	NeedMoreData = object()
				41
				42
				43
				44	class BufferedSubFile(object):
				45	"""A file-ish object that can have new data loaded into it.
				46
				47	You can also push and pop line-matching predicates onto a stack. When the
				48	current predicate matches the current line, a false EOF response
				49	(i.e. empty string) is returned instead. This lets the parser adhere to a
				50	simple abstraction -- it parses until EOF closes the current message.
				51	"""
				52	def __init__(self):
				53	# The last partial line pushed into this object.
				54	self._partial = ''
				55	# The list of full, pushed lines, in reverse order
				56	self._lines = []
				57	# The stack of false-EOF checking predicates.
				58	self._eofstack = []
				59	# A flag indicating whether the file has been closed or not.
				60	self._closed = False
				61
				62	def push_eof_matcher(self, pred):
				63	self._eofstack.append(pred)
				64
				65	def pop_eof_matcher(self):
				66	return self._eofstack.pop()
				67
				68	def close(self):
				69	# Don't forget any trailing partial line.
				70	self._lines.append(self._partial)
				71	self._partial = ''
				72	self._closed = True
				73
				74	def readline(self):
				75	if not self._lines:
				76	if self._closed:
				77	return ''
				78	return NeedMoreData
				79	# Pop the line off the stack and see if it matches the current
				80	# false-EOF predicate.
				81	line = self._lines.pop()
				82	# RFC 2046, section 5.1.2 requires us to recognize outer level
				83	# boundaries at any level of inner nesting. Do this, but be sure it's
				84	# in the order of most to least nested.
				85	for ateof in self._eofstack[::-1]:
				86	if ateof(line):
				87	# We're at the false EOF. But push the last line back first.
				88	self._lines.append(line)
				89	return ''
				90	return line
				91
				92	def unreadline(self, line):
				93	# Let the consumer push a line back into the buffer.
				94	assert line is not NeedMoreData
				95	self._lines.append(line)
				96
				97	def push(self, data):
				98	"""Push some new data into this object."""
				99	# Handle any previous leftovers
				100	data, self._partial = self._partial + data, ''
				101	# Crack into lines, but preserve the newlines on the end of each
				102	parts = NLCRE_crack.split(data)
				103	# The ahem interesting behaviour of re.split when supplied grouping
				104	# parentheses is that the last element of the resulting list is the
				105	# data after the final RE. In the case of a NL/CR terminated string,
				106	# this is the empty string.
				107	self._partial = parts.pop()
R. David Murray	45bf773f	2010-07-17 01:19:57 +0000	[diff] [blame]	108	#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
				109	# is there a \n to follow later?
				110	if not self._partial and parts and parts[-1].endswith('\r'):
				111	self._partial = parts.pop(-2)+parts.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	112	# parts is a list of strings, alternating between the line contents
				113	# and the eol character(s). Gather up a list of lines after
				114	# re-attaching the newlines.
				115	lines = []
				116	for i in range(len(parts) // 2):
				117	lines.append(parts[i2] + parts[i2+1])
				118	self.pushlines(lines)
				119
				120	def pushlines(self, lines):
				121	# Reverse and insert at the front of the lines.
				122	self._lines[:0] = lines[::-1]
				123
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	124	def __iter__(self):
				125	return self
				126
				127	def __next__(self):
				128	line = self.readline()
				129	if line == '':
				130	raise StopIteration
				131	return line
				132
				133
				134
				135	class FeedParser:
				136	"""A feed-style parser of email."""
				137
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	138	def __init__(self, _factory=message.Message, *, policy=policy.default):
				139	"""_factory is called with no arguments to create a new message obj
				140
				141	The policy keyword specifies a policy object that controls a number of
				142	aspects of the parser's operation. The default policy maintains
				143	backward compatibility.
				144
				145	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	146	self._factory = _factory
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	147	self.policy = policy
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	148	self._input = BufferedSubFile()
				149	self._msgstack = []
				150	self._parse = self._parsegen().__next__
				151	self._cur = None
				152	self._last = None
				153	self._headersonly = False
				154
				155	# Non-public interface for supporting Parser's headersonly flag
				156	def _set_headersonly(self):
				157	self._headersonly = True
				158
				159	def feed(self, data):
				160	"""Push more data into the parser."""
				161	self._input.push(data)
				162	self._call_parse()
				163
				164	def _call_parse(self):
				165	try:
				166	self._parse()
				167	except StopIteration:
				168	pass
				169
				170	def close(self):
				171	"""Parse all remaining data and return the root message object."""
				172	self._input.close()
				173	self._call_parse()
				174	root = self._pop_message()
				175	assert not self._msgstack
				176	# Look for final set of defects
				177	if root.get_content_maintype() == 'multipart' \
				178	and not root.is_multipart():
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	179	defect = errors.MultipartInvariantViolationDefect()
				180	self.policy.handle_defect(root, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	181	return root
				182
				183	def _new_message(self):
				184	msg = self._factory()
				185	if self._cur and self._cur.get_content_type() == 'multipart/digest':
				186	msg.set_default_type('message/rfc822')
				187	if self._msgstack:
				188	self._msgstack[-1].attach(msg)
				189	self._msgstack.append(msg)
				190	self._cur = msg
				191	self._last = msg
				192
				193	def _pop_message(self):
				194	retval = self._msgstack.pop()
				195	if self._msgstack:
				196	self._cur = self._msgstack[-1]
				197	else:
				198	self._cur = None
				199	return retval
				200
				201	def _parsegen(self):
				202	# Create a new message and start by parsing headers.
				203	self._new_message()
				204	headers = []
				205	# Collect the headers, searching for a line that doesn't match the RFC
				206	# 2822 header or continuation pattern (including an empty line).
				207	for line in self._input:
				208	if line is NeedMoreData:
				209	yield NeedMoreData
				210	continue
				211	if not headerRE.match(line):
				212	# If we saw the RFC defined header/body separator
				213	# (i.e. newline), just throw it away. Otherwise the line is
				214	# part of the body so push it back.
				215	if not NLCRE.match(line):
				216	self._input.unreadline(line)
				217	break
				218	headers.append(line)
				219	# Done with the headers, so parse them and figure out what we're
				220	# supposed to see in the body of the message.
				221	self._parse_headers(headers)
				222	# Headers-only parsing is a backwards compatibility hack, which was
				223	# necessary in the older parser, which could throw errors. All
				224	# remaining lines in the input are thrown into the message body.
				225	if self._headersonly:
				226	lines = []
				227	while True:
				228	line = self._input.readline()
				229	if line is NeedMoreData:
				230	yield NeedMoreData
				231	continue
				232	if line == '':
				233	break
				234	lines.append(line)
				235	self._cur.set_payload(EMPTYSTRING.join(lines))
				236	return
				237	if self._cur.get_content_type() == 'message/delivery-status':
				238	# message/delivery-status contains blocks of headers separated by
				239	# a blank line. We'll represent each header block as a separate
				240	# nested message object, but the processing is a bit different
				241	# than standard message/* types because there is no body for the
				242	# nested messages. A blank line separates the subparts.
				243	while True:
				244	self._input.push_eof_matcher(NLCRE.match)
				245	for retval in self._parsegen():
				246	if retval is NeedMoreData:
				247	yield NeedMoreData
				248	continue
				249	break
				250	msg = self._pop_message()
				251	# We need to pop the EOF matcher in order to tell if we're at
				252	# the end of the current file, not the end of the last block
				253	# of message headers.
				254	self._input.pop_eof_matcher()
				255	# The input stream must be sitting at the newline or at the
				256	# EOF. We want to see if we're at the end of this subpart, so
				257	# first consume the blank line, then test the next line to see
				258	# if we're at this subpart's EOF.
				259	while True:
				260	line = self._input.readline()
				261	if line is NeedMoreData:
				262	yield NeedMoreData
				263	continue
				264	break
				265	while True:
				266	line = self._input.readline()
				267	if line is NeedMoreData:
				268	yield NeedMoreData
				269	continue
				270	break
				271	if line == '':
				272	break
				273	# Not at EOF so this is a line we're going to need.
				274	self._input.unreadline(line)
				275	return
				276	if self._cur.get_content_maintype() == 'message':
				277	# The message claims to be a message/* type, then what follows is
				278	# another RFC 2822 message.
				279	for retval in self._parsegen():
				280	if retval is NeedMoreData:
				281	yield NeedMoreData
				282	continue
				283	break
				284	self._pop_message()
				285	return
				286	if self._cur.get_content_maintype() == 'multipart':
				287	boundary = self._cur.get_boundary()
				288	if boundary is None:
				289	# The message /claims/ to be a multipart but it has not
				290	# defined a boundary. That's a problem which we'll handle by
				291	# reading everything until the EOF and marking the message as
				292	# defective.
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	293	defect = errors.NoBoundaryInMultipartDefect()
				294	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	295	lines = []
				296	for line in self._input:
				297	if line is NeedMoreData:
				298	yield NeedMoreData
				299	continue
				300	lines.append(line)
				301	self._cur.set_payload(EMPTYSTRING.join(lines))
				302	return
				303	# Create a line match predicate which matches the inter-part
				304	# boundary as well as the end-of-multipart boundary. Don't push
				305	# this onto the input stream until we've scanned past the
				306	# preamble.
				307	separator = '--' + boundary
				308	boundaryre = re.compile(
				309	'(?P<sep>' + re.escape(separator) +
				310	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)?$')
				311	capturing_preamble = True
				312	preamble = []
				313	linesep = False
				314	while True:
				315	line = self._input.readline()
				316	if line is NeedMoreData:
				317	yield NeedMoreData
				318	continue
				319	if line == '':
				320	break
				321	mo = boundaryre.match(line)
				322	if mo:
				323	# If we're looking at the end boundary, we're done with
				324	# this multipart. If there was a newline at the end of
				325	# the closing boundary, then we need to initialize the
				326	# epilogue with the empty string (see below).
				327	if mo.group('end'):
				328	linesep = mo.group('linesep')
				329	break
				330	# We saw an inter-part boundary. Were we in the preamble?
				331	if capturing_preamble:
				332	if preamble:
				333	# According to RFC 2046, the last newline belongs
				334	# to the boundary.
				335	lastline = preamble[-1]
				336	eolmo = NLCRE_eol.search(lastline)
				337	if eolmo:
				338	preamble[-1] = lastline[:-len(eolmo.group(0))]
				339	self._cur.preamble = EMPTYSTRING.join(preamble)
				340	capturing_preamble = False
				341	self._input.unreadline(line)
				342	continue
				343	# We saw a boundary separating two parts. Consume any
				344	# multiple boundary lines that may be following. Our
				345	# interpretation of RFC 2046 BNF grammar does not produce
				346	# body parts within such double boundaries.
				347	while True:
				348	line = self._input.readline()
				349	if line is NeedMoreData:
				350	yield NeedMoreData
				351	continue
				352	mo = boundaryre.match(line)
				353	if not mo:
				354	self._input.unreadline(line)
				355	break
				356	# Recurse to parse this subpart; the input stream points
				357	# at the subpart's first line.
				358	self._input.push_eof_matcher(boundaryre.match)
				359	for retval in self._parsegen():
				360	if retval is NeedMoreData:
				361	yield NeedMoreData
				362	continue
				363	break
				364	# Because of RFC 2046, the newline preceding the boundary
				365	# separator actually belongs to the boundary, not the
				366	# previous subpart's payload (or epilogue if the previous
				367	# part is a multipart).
				368	if self._last.get_content_maintype() == 'multipart':
				369	epilogue = self._last.epilogue
				370	if epilogue == '':
				371	self._last.epilogue = None
				372	elif epilogue is not None:
				373	mo = NLCRE_eol.search(epilogue)
				374	if mo:
				375	end = len(mo.group(0))
				376	self._last.epilogue = epilogue[:-end]
				377	else:
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	378	payload = self._last._payload
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	379	if isinstance(payload, str):
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	380	mo = NLCRE_eol.search(payload)
				381	if mo:
				382	payload = payload[:-len(mo.group(0))]
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	383	self._last._payload = payload
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	384	self._input.pop_eof_matcher()
				385	self._pop_message()
				386	# Set the multipart up for newline cleansing, which will
				387	# happen if we're in a nested multipart.
				388	self._last = self._cur
				389	else:
				390	# I think we must be in the preamble
				391	assert capturing_preamble
				392	preamble.append(line)
				393	# We've seen either the EOF or the end boundary. If we're still
				394	# capturing the preamble, we never saw the start boundary. Note
				395	# that as a defect and store the captured text as the payload.
				396	# Everything from here to the EOF is epilogue.
				397	if capturing_preamble:
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	398	defect = errors.StartBoundaryNotFoundDefect()
				399	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	400	self._cur.set_payload(EMPTYSTRING.join(preamble))
				401	epilogue = []
				402	for line in self._input:
				403	if line is NeedMoreData:
				404	yield NeedMoreData
				405	continue
				406	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				407	return
				408	# If the end boundary ended in a newline, we'll need to make sure
				409	# the epilogue isn't None
				410	if linesep:
				411	epilogue = ['']
				412	else:
				413	epilogue = []
				414	for line in self._input:
				415	if line is NeedMoreData:
				416	yield NeedMoreData
				417	continue
				418	epilogue.append(line)
				419	# Any CRLF at the front of the epilogue is not technically part of
				420	# the epilogue. Also, watch out for an empty string epilogue,
				421	# which means a single newline.
				422	if epilogue:
				423	firstline = epilogue[0]
				424	bolmo = NLCRE_bol.match(firstline)
				425	if bolmo:
				426	epilogue[0] = firstline[len(bolmo.group(0)):]
				427	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				428	return
				429	# Otherwise, it's some non-multipart type, so the entire rest of the
				430	# file contents becomes the payload.
				431	lines = []
				432	for line in self._input:
				433	if line is NeedMoreData:
				434	yield NeedMoreData
				435	continue
				436	lines.append(line)
				437	self._cur.set_payload(EMPTYSTRING.join(lines))
				438
				439	def _parse_headers(self, lines):
				440	# Passed a list of lines that make up the headers for the current msg
				441	lastheader = ''
				442	lastvalue = []
				443	for lineno, line in enumerate(lines):
				444	# Check for continuation
				445	if line[0] in ' \t':
				446	if not lastheader:
				447	# The first line of the headers was a continuation. This
				448	# is illegal, so let's note the defect, store the illegal
				449	# line, and ignore it for purposes of headers.
				450	defect = errors.FirstHeaderLineIsContinuationDefect(line)
R David Murray	3edd22a	2011-04-18 13:59:37 -0400	[diff] [blame]	451	self.policy.handle_defect(self._cur, defect)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	452	continue
				453	lastvalue.append(line)
				454	continue
				455	if lastheader:
				456	# XXX reconsider the joining of folded lines
				457	lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
				458	self._cur[lastheader] = lhdr
				459	lastheader, lastvalue = '', []
				460	# Check for envelope header, i.e. unix-from
				461	if line.startswith('From '):
				462	if lineno == 0:
				463	# Strip off the trailing newline
				464	mo = NLCRE_eol.search(line)
				465	if mo:
				466	line = line[:-len(mo.group(0))]
				467	self._cur.set_unixfrom(line)
				468	continue
				469	elif lineno == len(lines) - 1:
				470	# Something looking like a unix-from at the end - it's
				471	# probably the first line of the body, so push back the
				472	# line and stop.
				473	self._input.unreadline(line)
				474	return
				475	else:
				476	# Weirdly placed unix-from line. Note this as a defect
				477	# and ignore it.
				478	defect = errors.MisplacedEnvelopeHeaderDefect(line)
				479	self._cur.defects.append(defect)
				480	continue
				481	# Split the line on the colon separating field name from value.
				482	i = line.find(':')
				483	if i < 0:
				484	defect = errors.MalformedHeaderDefect(line)
				485	self._cur.defects.append(defect)
				486	continue
				487	lastheader = line[:i]
				488	lastvalue = [line[i+1:].lstrip()]
				489	# Done with all the lines, so handle the last header.
				490	if lastheader:
				491	# XXX reconsider the joining of folded lines
				492	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
R. David Murray	96fd54e	2010-10-08 15:55:28 +0000	[diff] [blame]	493
				494
				495	class BytesFeedParser(FeedParser):
				496	"""Like FeedParser, but feed accepts bytes."""
				497
				498	def feed(self, data):
				499	super().feed(data.decode('ascii', 'surrogateescape'))