Blame - Lib/email/feedparser.py - platform/external/python/cpython3

blob: 60a83255c0d30820e26f0dd08dd7585eff20bf04 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2004-2006 Python Software Foundation
				2	# Authors: Baxter, Wouters and Warsaw
				3	# Contact: email-sig@python.org
				4
				5	"""FeedParser - An email feed parser.
				6
				7	The feed parser implements an interface for incrementally parsing an email
				8	message, line by line. This has advantages for certain applications, such as
				9	those reading email messages off a socket.
				10
				11	FeedParser.feed() is the primary interface for pushing new data into the
				12	parser. It returns when there's nothing more it can do with the available
				13	data. When you have no more data to push into the parser, call .close().
				14	This completes the parsing and returns the root message object.
				15
				16	The other advantage of this parser is that it will never throw a parsing
				17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
				18	the current message. Defects are just instances that live on the message
				19	object's .defects attribute.
				20	"""
				21
				22	__all__ = ['FeedParser']
				23
				24	import re
				25
				26	from email import errors
				27	from email import message
				28
				29	NLCRE = re.compile('\r\n\|\r\|\n')
				30	NLCRE_bol = re.compile('(\r\n\|\r\|\n)')
R. David Murray	45e0e14	2010-06-16 02:19:40 +0000	[diff] [blame]	31	NLCRE_eol = re.compile('(\r\n\|\r\|\n)\Z')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	32	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
				33	# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
				34	# except controls, SP, and ":".
				35	headerRE = re.compile(r'^(From \|[\041-\071\073-\176]{1,}:\|[\t ])')
				36	EMPTYSTRING = ''
				37	NL = '\n'
				38
				39	NeedMoreData = object()
				40
				41
				42
				43	class BufferedSubFile(object):
				44	"""A file-ish object that can have new data loaded into it.
				45
				46	You can also push and pop line-matching predicates onto a stack. When the
				47	current predicate matches the current line, a false EOF response
				48	(i.e. empty string) is returned instead. This lets the parser adhere to a
				49	simple abstraction -- it parses until EOF closes the current message.
				50	"""
				51	def __init__(self):
				52	# The last partial line pushed into this object.
				53	self._partial = ''
				54	# The list of full, pushed lines, in reverse order
				55	self._lines = []
				56	# The stack of false-EOF checking predicates.
				57	self._eofstack = []
				58	# A flag indicating whether the file has been closed or not.
				59	self._closed = False
				60
				61	def push_eof_matcher(self, pred):
				62	self._eofstack.append(pred)
				63
				64	def pop_eof_matcher(self):
				65	return self._eofstack.pop()
				66
				67	def close(self):
				68	# Don't forget any trailing partial line.
				69	self._lines.append(self._partial)
				70	self._partial = ''
				71	self._closed = True
				72
				73	def readline(self):
				74	if not self._lines:
				75	if self._closed:
				76	return ''
				77	return NeedMoreData
				78	# Pop the line off the stack and see if it matches the current
				79	# false-EOF predicate.
				80	line = self._lines.pop()
				81	# RFC 2046, section 5.1.2 requires us to recognize outer level
				82	# boundaries at any level of inner nesting. Do this, but be sure it's
				83	# in the order of most to least nested.
				84	for ateof in self._eofstack[::-1]:
				85	if ateof(line):
				86	# We're at the false EOF. But push the last line back first.
				87	self._lines.append(line)
				88	return ''
				89	return line
				90
				91	def unreadline(self, line):
				92	# Let the consumer push a line back into the buffer.
				93	assert line is not NeedMoreData
				94	self._lines.append(line)
				95
				96	def push(self, data):
				97	"""Push some new data into this object."""
				98	# Handle any previous leftovers
				99	data, self._partial = self._partial + data, ''
				100	# Crack into lines, but preserve the newlines on the end of each
				101	parts = NLCRE_crack.split(data)
				102	# The ahem interesting behaviour of re.split when supplied grouping
				103	# parentheses is that the last element of the resulting list is the
				104	# data after the final RE. In the case of a NL/CR terminated string,
				105	# this is the empty string.
				106	self._partial = parts.pop()
R. David Murray	45bf773f	2010-07-17 01:19:57 +0000	[diff] [blame]	107	#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
				108	# is there a \n to follow later?
				109	if not self._partial and parts and parts[-1].endswith('\r'):
				110	self._partial = parts.pop(-2)+parts.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	111	# parts is a list of strings, alternating between the line contents
				112	# and the eol character(s). Gather up a list of lines after
				113	# re-attaching the newlines.
				114	lines = []
				115	for i in range(len(parts) // 2):
				116	lines.append(parts[i2] + parts[i2+1])
				117	self.pushlines(lines)
				118
				119	def pushlines(self, lines):
				120	# Reverse and insert at the front of the lines.
				121	self._lines[:0] = lines[::-1]
				122
				123	def is_closed(self):
				124	return self._closed
				125
				126	def __iter__(self):
				127	return self
				128
				129	def __next__(self):
				130	line = self.readline()
				131	if line == '':
				132	raise StopIteration
				133	return line
				134
				135
				136
				137	class FeedParser:
				138	"""A feed-style parser of email."""
				139
				140	def __init__(self, _factory=message.Message):
				141	"""_factory is called with no arguments to create a new message obj"""
				142	self._factory = _factory
				143	self._input = BufferedSubFile()
				144	self._msgstack = []
				145	self._parse = self._parsegen().__next__
				146	self._cur = None
				147	self._last = None
				148	self._headersonly = False
				149
				150	# Non-public interface for supporting Parser's headersonly flag
				151	def _set_headersonly(self):
				152	self._headersonly = True
				153
				154	def feed(self, data):
				155	"""Push more data into the parser."""
				156	self._input.push(data)
				157	self._call_parse()
				158
				159	def _call_parse(self):
				160	try:
				161	self._parse()
				162	except StopIteration:
				163	pass
				164
				165	def close(self):
				166	"""Parse all remaining data and return the root message object."""
				167	self._input.close()
				168	self._call_parse()
				169	root = self._pop_message()
				170	assert not self._msgstack
				171	# Look for final set of defects
				172	if root.get_content_maintype() == 'multipart' \
				173	and not root.is_multipart():
				174	root.defects.append(errors.MultipartInvariantViolationDefect())
				175	return root
				176
				177	def _new_message(self):
				178	msg = self._factory()
				179	if self._cur and self._cur.get_content_type() == 'multipart/digest':
				180	msg.set_default_type('message/rfc822')
				181	if self._msgstack:
				182	self._msgstack[-1].attach(msg)
				183	self._msgstack.append(msg)
				184	self._cur = msg
				185	self._last = msg
				186
				187	def _pop_message(self):
				188	retval = self._msgstack.pop()
				189	if self._msgstack:
				190	self._cur = self._msgstack[-1]
				191	else:
				192	self._cur = None
				193	return retval
				194
				195	def _parsegen(self):
				196	# Create a new message and start by parsing headers.
				197	self._new_message()
				198	headers = []
				199	# Collect the headers, searching for a line that doesn't match the RFC
				200	# 2822 header or continuation pattern (including an empty line).
				201	for line in self._input:
				202	if line is NeedMoreData:
				203	yield NeedMoreData
				204	continue
				205	if not headerRE.match(line):
				206	# If we saw the RFC defined header/body separator
				207	# (i.e. newline), just throw it away. Otherwise the line is
				208	# part of the body so push it back.
				209	if not NLCRE.match(line):
				210	self._input.unreadline(line)
				211	break
				212	headers.append(line)
				213	# Done with the headers, so parse them and figure out what we're
				214	# supposed to see in the body of the message.
				215	self._parse_headers(headers)
				216	# Headers-only parsing is a backwards compatibility hack, which was
				217	# necessary in the older parser, which could throw errors. All
				218	# remaining lines in the input are thrown into the message body.
				219	if self._headersonly:
				220	lines = []
				221	while True:
				222	line = self._input.readline()
				223	if line is NeedMoreData:
				224	yield NeedMoreData
				225	continue
				226	if line == '':
				227	break
				228	lines.append(line)
				229	self._cur.set_payload(EMPTYSTRING.join(lines))
				230	return
				231	if self._cur.get_content_type() == 'message/delivery-status':
				232	# message/delivery-status contains blocks of headers separated by
				233	# a blank line. We'll represent each header block as a separate
				234	# nested message object, but the processing is a bit different
				235	# than standard message/* types because there is no body for the
				236	# nested messages. A blank line separates the subparts.
				237	while True:
				238	self._input.push_eof_matcher(NLCRE.match)
				239	for retval in self._parsegen():
				240	if retval is NeedMoreData:
				241	yield NeedMoreData
				242	continue
				243	break
				244	msg = self._pop_message()
				245	# We need to pop the EOF matcher in order to tell if we're at
				246	# the end of the current file, not the end of the last block
				247	# of message headers.
				248	self._input.pop_eof_matcher()
				249	# The input stream must be sitting at the newline or at the
				250	# EOF. We want to see if we're at the end of this subpart, so
				251	# first consume the blank line, then test the next line to see
				252	# if we're at this subpart's EOF.
				253	while True:
				254	line = self._input.readline()
				255	if line is NeedMoreData:
				256	yield NeedMoreData
				257	continue
				258	break
				259	while True:
				260	line = self._input.readline()
				261	if line is NeedMoreData:
				262	yield NeedMoreData
				263	continue
				264	break
				265	if line == '':
				266	break
				267	# Not at EOF so this is a line we're going to need.
				268	self._input.unreadline(line)
				269	return
				270	if self._cur.get_content_maintype() == 'message':
				271	# The message claims to be a message/* type, then what follows is
				272	# another RFC 2822 message.
				273	for retval in self._parsegen():
				274	if retval is NeedMoreData:
				275	yield NeedMoreData
				276	continue
				277	break
				278	self._pop_message()
				279	return
				280	if self._cur.get_content_maintype() == 'multipart':
				281	boundary = self._cur.get_boundary()
				282	if boundary is None:
				283	# The message /claims/ to be a multipart but it has not
				284	# defined a boundary. That's a problem which we'll handle by
				285	# reading everything until the EOF and marking the message as
				286	# defective.
				287	self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
				288	lines = []
				289	for line in self._input:
				290	if line is NeedMoreData:
				291	yield NeedMoreData
				292	continue
				293	lines.append(line)
				294	self._cur.set_payload(EMPTYSTRING.join(lines))
				295	return
				296	# Create a line match predicate which matches the inter-part
				297	# boundary as well as the end-of-multipart boundary. Don't push
				298	# this onto the input stream until we've scanned past the
				299	# preamble.
				300	separator = '--' + boundary
				301	boundaryre = re.compile(
				302	'(?P<sep>' + re.escape(separator) +
				303	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)?$')
				304	capturing_preamble = True
				305	preamble = []
				306	linesep = False
				307	while True:
				308	line = self._input.readline()
				309	if line is NeedMoreData:
				310	yield NeedMoreData
				311	continue
				312	if line == '':
				313	break
				314	mo = boundaryre.match(line)
				315	if mo:
				316	# If we're looking at the end boundary, we're done with
				317	# this multipart. If there was a newline at the end of
				318	# the closing boundary, then we need to initialize the
				319	# epilogue with the empty string (see below).
				320	if mo.group('end'):
				321	linesep = mo.group('linesep')
				322	break
				323	# We saw an inter-part boundary. Were we in the preamble?
				324	if capturing_preamble:
				325	if preamble:
				326	# According to RFC 2046, the last newline belongs
				327	# to the boundary.
				328	lastline = preamble[-1]
				329	eolmo = NLCRE_eol.search(lastline)
				330	if eolmo:
				331	preamble[-1] = lastline[:-len(eolmo.group(0))]
				332	self._cur.preamble = EMPTYSTRING.join(preamble)
				333	capturing_preamble = False
				334	self._input.unreadline(line)
				335	continue
				336	# We saw a boundary separating two parts. Consume any
				337	# multiple boundary lines that may be following. Our
				338	# interpretation of RFC 2046 BNF grammar does not produce
				339	# body parts within such double boundaries.
				340	while True:
				341	line = self._input.readline()
				342	if line is NeedMoreData:
				343	yield NeedMoreData
				344	continue
				345	mo = boundaryre.match(line)
				346	if not mo:
				347	self._input.unreadline(line)
				348	break
				349	# Recurse to parse this subpart; the input stream points
				350	# at the subpart's first line.
				351	self._input.push_eof_matcher(boundaryre.match)
				352	for retval in self._parsegen():
				353	if retval is NeedMoreData:
				354	yield NeedMoreData
				355	continue
				356	break
				357	# Because of RFC 2046, the newline preceding the boundary
				358	# separator actually belongs to the boundary, not the
				359	# previous subpart's payload (or epilogue if the previous
				360	# part is a multipart).
				361	if self._last.get_content_maintype() == 'multipart':
				362	epilogue = self._last.epilogue
				363	if epilogue == '':
				364	self._last.epilogue = None
				365	elif epilogue is not None:
				366	mo = NLCRE_eol.search(epilogue)
				367	if mo:
				368	end = len(mo.group(0))
				369	self._last.epilogue = epilogue[:-end]
				370	else:
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame^]	371	payload = self._last._payload
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	372	if isinstance(payload, str):
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	373	mo = NLCRE_eol.search(payload)
				374	if mo:
				375	payload = payload[:-len(mo.group(0))]
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame^]	376	self._last._payload = payload
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	377	self._input.pop_eof_matcher()
				378	self._pop_message()
				379	# Set the multipart up for newline cleansing, which will
				380	# happen if we're in a nested multipart.
				381	self._last = self._cur
				382	else:
				383	# I think we must be in the preamble
				384	assert capturing_preamble
				385	preamble.append(line)
				386	# We've seen either the EOF or the end boundary. If we're still
				387	# capturing the preamble, we never saw the start boundary. Note
				388	# that as a defect and store the captured text as the payload.
				389	# Everything from here to the EOF is epilogue.
				390	if capturing_preamble:
				391	self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
				392	self._cur.set_payload(EMPTYSTRING.join(preamble))
				393	epilogue = []
				394	for line in self._input:
				395	if line is NeedMoreData:
				396	yield NeedMoreData
				397	continue
				398	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				399	return
				400	# If the end boundary ended in a newline, we'll need to make sure
				401	# the epilogue isn't None
				402	if linesep:
				403	epilogue = ['']
				404	else:
				405	epilogue = []
				406	for line in self._input:
				407	if line is NeedMoreData:
				408	yield NeedMoreData
				409	continue
				410	epilogue.append(line)
				411	# Any CRLF at the front of the epilogue is not technically part of
				412	# the epilogue. Also, watch out for an empty string epilogue,
				413	# which means a single newline.
				414	if epilogue:
				415	firstline = epilogue[0]
				416	bolmo = NLCRE_bol.match(firstline)
				417	if bolmo:
				418	epilogue[0] = firstline[len(bolmo.group(0)):]
				419	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				420	return
				421	# Otherwise, it's some non-multipart type, so the entire rest of the
				422	# file contents becomes the payload.
				423	lines = []
				424	for line in self._input:
				425	if line is NeedMoreData:
				426	yield NeedMoreData
				427	continue
				428	lines.append(line)
				429	self._cur.set_payload(EMPTYSTRING.join(lines))
				430
				431	def _parse_headers(self, lines):
				432	# Passed a list of lines that make up the headers for the current msg
				433	lastheader = ''
				434	lastvalue = []
				435	for lineno, line in enumerate(lines):
				436	# Check for continuation
				437	if line[0] in ' \t':
				438	if not lastheader:
				439	# The first line of the headers was a continuation. This
				440	# is illegal, so let's note the defect, store the illegal
				441	# line, and ignore it for purposes of headers.
				442	defect = errors.FirstHeaderLineIsContinuationDefect(line)
				443	self._cur.defects.append(defect)
				444	continue
				445	lastvalue.append(line)
				446	continue
				447	if lastheader:
				448	# XXX reconsider the joining of folded lines
				449	lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
				450	self._cur[lastheader] = lhdr
				451	lastheader, lastvalue = '', []
				452	# Check for envelope header, i.e. unix-from
				453	if line.startswith('From '):
				454	if lineno == 0:
				455	# Strip off the trailing newline
				456	mo = NLCRE_eol.search(line)
				457	if mo:
				458	line = line[:-len(mo.group(0))]
				459	self._cur.set_unixfrom(line)
				460	continue
				461	elif lineno == len(lines) - 1:
				462	# Something looking like a unix-from at the end - it's
				463	# probably the first line of the body, so push back the
				464	# line and stop.
				465	self._input.unreadline(line)
				466	return
				467	else:
				468	# Weirdly placed unix-from line. Note this as a defect
				469	# and ignore it.
				470	defect = errors.MisplacedEnvelopeHeaderDefect(line)
				471	self._cur.defects.append(defect)
				472	continue
				473	# Split the line on the colon separating field name from value.
				474	i = line.find(':')
				475	if i < 0:
				476	defect = errors.MalformedHeaderDefect(line)
				477	self._cur.defects.append(defect)
				478	continue
				479	lastheader = line[:i]
				480	lastvalue = [line[i+1:].lstrip()]
				481	# Done with all the lines, so handle the last header.
				482	if lastheader:
				483	# XXX reconsider the joining of folded lines
				484	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
R. David Murray	96fd54e	2010-10-08 15:55:28 +0000	[diff] [blame]	485
				486
				487	class BytesFeedParser(FeedParser):
				488	"""Like FeedParser, but feed accepts bytes."""
				489
				490	def feed(self, data):
				491	super().feed(data.decode('ascii', 'surrogateescape'))