Blame - Lib/email/feedparser.py - platform/external/python/cpython3

blob: 1b752d0193ad1203248b42d7d8ac8241e25e671b [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2004-2006 Python Software Foundation
				2	# Authors: Baxter, Wouters and Warsaw
				3	# Contact: email-sig@python.org
				4
				5	"""FeedParser - An email feed parser.
				6
				7	The feed parser implements an interface for incrementally parsing an email
				8	message, line by line. This has advantages for certain applications, such as
				9	those reading email messages off a socket.
				10
				11	FeedParser.feed() is the primary interface for pushing new data into the
				12	parser. It returns when there's nothing more it can do with the available
				13	data. When you have no more data to push into the parser, call .close().
				14	This completes the parsing and returns the root message object.
				15
				16	The other advantage of this parser is that it will never throw a parsing
				17	exception. Instead, when it finds something unexpected, it adds a 'defect' to
				18	the current message. Defects are just instances that live on the message
				19	object's .defects attribute.
				20	"""
				21
				22	__all__ = ['FeedParser']
				23
				24	import re
				25
				26	from email import errors
				27	from email import message
				28
				29	NLCRE = re.compile('\r\n\|\r\|\n')
				30	NLCRE_bol = re.compile('(\r\n\|\r\|\n)')
R. David Murray	45e0e14	2010-06-16 02:19:40 +0000	[diff] [blame]	31	NLCRE_eol = re.compile('(\r\n\|\r\|\n)\Z')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	32	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
				33	# RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character
				34	# except controls, SP, and ":".
				35	headerRE = re.compile(r'^(From \|[\041-\071\073-\176]{1,}:\|[\t ])')
				36	EMPTYSTRING = ''
				37	NL = '\n'
				38
				39	NeedMoreData = object()
				40
				41
				42
				43	class BufferedSubFile(object):
				44	"""A file-ish object that can have new data loaded into it.
				45
				46	You can also push and pop line-matching predicates onto a stack. When the
				47	current predicate matches the current line, a false EOF response
				48	(i.e. empty string) is returned instead. This lets the parser adhere to a
				49	simple abstraction -- it parses until EOF closes the current message.
				50	"""
				51	def __init__(self):
				52	# The last partial line pushed into this object.
				53	self._partial = ''
				54	# The list of full, pushed lines, in reverse order
				55	self._lines = []
				56	# The stack of false-EOF checking predicates.
				57	self._eofstack = []
				58	# A flag indicating whether the file has been closed or not.
				59	self._closed = False
				60
				61	def push_eof_matcher(self, pred):
				62	self._eofstack.append(pred)
				63
				64	def pop_eof_matcher(self):
				65	return self._eofstack.pop()
				66
				67	def close(self):
				68	# Don't forget any trailing partial line.
				69	self._lines.append(self._partial)
				70	self._partial = ''
				71	self._closed = True
				72
				73	def readline(self):
				74	if not self._lines:
				75	if self._closed:
				76	return ''
				77	return NeedMoreData
				78	# Pop the line off the stack and see if it matches the current
				79	# false-EOF predicate.
				80	line = self._lines.pop()
				81	# RFC 2046, section 5.1.2 requires us to recognize outer level
				82	# boundaries at any level of inner nesting. Do this, but be sure it's
				83	# in the order of most to least nested.
				84	for ateof in self._eofstack[::-1]:
				85	if ateof(line):
				86	# We're at the false EOF. But push the last line back first.
				87	self._lines.append(line)
				88	return ''
				89	return line
				90
				91	def unreadline(self, line):
				92	# Let the consumer push a line back into the buffer.
				93	assert line is not NeedMoreData
				94	self._lines.append(line)
				95
				96	def push(self, data):
				97	"""Push some new data into this object."""
				98	# Handle any previous leftovers
				99	data, self._partial = self._partial + data, ''
				100	# Crack into lines, but preserve the newlines on the end of each
				101	parts = NLCRE_crack.split(data)
				102	# The ahem interesting behaviour of re.split when supplied grouping
				103	# parentheses is that the last element of the resulting list is the
				104	# data after the final RE. In the case of a NL/CR terminated string,
				105	# this is the empty string.
				106	self._partial = parts.pop()
R. David Murray	45bf773f	2010-07-17 01:19:57 +0000	[diff] [blame]	107	#GAN 29Mar09 bugs 1555570, 1721862 Confusion at 8K boundary ending with \r:
				108	# is there a \n to follow later?
				109	if not self._partial and parts and parts[-1].endswith('\r'):
				110	self._partial = parts.pop(-2)+parts.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	111	# parts is a list of strings, alternating between the line contents
				112	# and the eol character(s). Gather up a list of lines after
				113	# re-attaching the newlines.
				114	lines = []
				115	for i in range(len(parts) // 2):
				116	lines.append(parts[i2] + parts[i2+1])
				117	self.pushlines(lines)
				118
				119	def pushlines(self, lines):
				120	# Reverse and insert at the front of the lines.
				121	self._lines[:0] = lines[::-1]
				122
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	123	def __iter__(self):
				124	return self
				125
				126	def __next__(self):
				127	line = self.readline()
				128	if line == '':
				129	raise StopIteration
				130	return line
				131
				132
				133
				134	class FeedParser:
				135	"""A feed-style parser of email."""
				136
				137	def __init__(self, _factory=message.Message):
				138	"""_factory is called with no arguments to create a new message obj"""
				139	self._factory = _factory
				140	self._input = BufferedSubFile()
				141	self._msgstack = []
				142	self._parse = self._parsegen().__next__
				143	self._cur = None
				144	self._last = None
				145	self._headersonly = False
				146
				147	# Non-public interface for supporting Parser's headersonly flag
				148	def _set_headersonly(self):
				149	self._headersonly = True
				150
				151	def feed(self, data):
				152	"""Push more data into the parser."""
				153	self._input.push(data)
				154	self._call_parse()
				155
				156	def _call_parse(self):
				157	try:
				158	self._parse()
				159	except StopIteration:
				160	pass
				161
				162	def close(self):
				163	"""Parse all remaining data and return the root message object."""
				164	self._input.close()
				165	self._call_parse()
				166	root = self._pop_message()
				167	assert not self._msgstack
				168	# Look for final set of defects
				169	if root.get_content_maintype() == 'multipart' \
				170	and not root.is_multipart():
				171	root.defects.append(errors.MultipartInvariantViolationDefect())
				172	return root
				173
				174	def _new_message(self):
				175	msg = self._factory()
				176	if self._cur and self._cur.get_content_type() == 'multipart/digest':
				177	msg.set_default_type('message/rfc822')
				178	if self._msgstack:
				179	self._msgstack[-1].attach(msg)
				180	self._msgstack.append(msg)
				181	self._cur = msg
				182	self._last = msg
				183
				184	def _pop_message(self):
				185	retval = self._msgstack.pop()
				186	if self._msgstack:
				187	self._cur = self._msgstack[-1]
				188	else:
				189	self._cur = None
				190	return retval
				191
				192	def _parsegen(self):
				193	# Create a new message and start by parsing headers.
				194	self._new_message()
				195	headers = []
				196	# Collect the headers, searching for a line that doesn't match the RFC
				197	# 2822 header or continuation pattern (including an empty line).
				198	for line in self._input:
				199	if line is NeedMoreData:
				200	yield NeedMoreData
				201	continue
				202	if not headerRE.match(line):
				203	# If we saw the RFC defined header/body separator
				204	# (i.e. newline), just throw it away. Otherwise the line is
				205	# part of the body so push it back.
				206	if not NLCRE.match(line):
				207	self._input.unreadline(line)
				208	break
				209	headers.append(line)
				210	# Done with the headers, so parse them and figure out what we're
				211	# supposed to see in the body of the message.
				212	self._parse_headers(headers)
				213	# Headers-only parsing is a backwards compatibility hack, which was
				214	# necessary in the older parser, which could throw errors. All
				215	# remaining lines in the input are thrown into the message body.
				216	if self._headersonly:
				217	lines = []
				218	while True:
				219	line = self._input.readline()
				220	if line is NeedMoreData:
				221	yield NeedMoreData
				222	continue
				223	if line == '':
				224	break
				225	lines.append(line)
				226	self._cur.set_payload(EMPTYSTRING.join(lines))
				227	return
				228	if self._cur.get_content_type() == 'message/delivery-status':
				229	# message/delivery-status contains blocks of headers separated by
				230	# a blank line. We'll represent each header block as a separate
				231	# nested message object, but the processing is a bit different
				232	# than standard message/* types because there is no body for the
				233	# nested messages. A blank line separates the subparts.
				234	while True:
				235	self._input.push_eof_matcher(NLCRE.match)
				236	for retval in self._parsegen():
				237	if retval is NeedMoreData:
				238	yield NeedMoreData
				239	continue
				240	break
				241	msg = self._pop_message()
				242	# We need to pop the EOF matcher in order to tell if we're at
				243	# the end of the current file, not the end of the last block
				244	# of message headers.
				245	self._input.pop_eof_matcher()
				246	# The input stream must be sitting at the newline or at the
				247	# EOF. We want to see if we're at the end of this subpart, so
				248	# first consume the blank line, then test the next line to see
				249	# if we're at this subpart's EOF.
				250	while True:
				251	line = self._input.readline()
				252	if line is NeedMoreData:
				253	yield NeedMoreData
				254	continue
				255	break
				256	while True:
				257	line = self._input.readline()
				258	if line is NeedMoreData:
				259	yield NeedMoreData
				260	continue
				261	break
				262	if line == '':
				263	break
				264	# Not at EOF so this is a line we're going to need.
				265	self._input.unreadline(line)
				266	return
				267	if self._cur.get_content_maintype() == 'message':
				268	# The message claims to be a message/* type, then what follows is
				269	# another RFC 2822 message.
				270	for retval in self._parsegen():
				271	if retval is NeedMoreData:
				272	yield NeedMoreData
				273	continue
				274	break
				275	self._pop_message()
				276	return
				277	if self._cur.get_content_maintype() == 'multipart':
				278	boundary = self._cur.get_boundary()
				279	if boundary is None:
				280	# The message /claims/ to be a multipart but it has not
				281	# defined a boundary. That's a problem which we'll handle by
				282	# reading everything until the EOF and marking the message as
				283	# defective.
				284	self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
				285	lines = []
				286	for line in self._input:
				287	if line is NeedMoreData:
				288	yield NeedMoreData
				289	continue
				290	lines.append(line)
				291	self._cur.set_payload(EMPTYSTRING.join(lines))
				292	return
				293	# Create a line match predicate which matches the inter-part
				294	# boundary as well as the end-of-multipart boundary. Don't push
				295	# this onto the input stream until we've scanned past the
				296	# preamble.
				297	separator = '--' + boundary
				298	boundaryre = re.compile(
				299	'(?P<sep>' + re.escape(separator) +
				300	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)?$')
				301	capturing_preamble = True
				302	preamble = []
				303	linesep = False
				304	while True:
				305	line = self._input.readline()
				306	if line is NeedMoreData:
				307	yield NeedMoreData
				308	continue
				309	if line == '':
				310	break
				311	mo = boundaryre.match(line)
				312	if mo:
				313	# If we're looking at the end boundary, we're done with
				314	# this multipart. If there was a newline at the end of
				315	# the closing boundary, then we need to initialize the
				316	# epilogue with the empty string (see below).
				317	if mo.group('end'):
				318	linesep = mo.group('linesep')
				319	break
				320	# We saw an inter-part boundary. Were we in the preamble?
				321	if capturing_preamble:
				322	if preamble:
				323	# According to RFC 2046, the last newline belongs
				324	# to the boundary.
				325	lastline = preamble[-1]
				326	eolmo = NLCRE_eol.search(lastline)
				327	if eolmo:
				328	preamble[-1] = lastline[:-len(eolmo.group(0))]
				329	self._cur.preamble = EMPTYSTRING.join(preamble)
				330	capturing_preamble = False
				331	self._input.unreadline(line)
				332	continue
				333	# We saw a boundary separating two parts. Consume any
				334	# multiple boundary lines that may be following. Our
				335	# interpretation of RFC 2046 BNF grammar does not produce
				336	# body parts within such double boundaries.
				337	while True:
				338	line = self._input.readline()
				339	if line is NeedMoreData:
				340	yield NeedMoreData
				341	continue
				342	mo = boundaryre.match(line)
				343	if not mo:
				344	self._input.unreadline(line)
				345	break
				346	# Recurse to parse this subpart; the input stream points
				347	# at the subpart's first line.
				348	self._input.push_eof_matcher(boundaryre.match)
				349	for retval in self._parsegen():
				350	if retval is NeedMoreData:
				351	yield NeedMoreData
				352	continue
				353	break
				354	# Because of RFC 2046, the newline preceding the boundary
				355	# separator actually belongs to the boundary, not the
				356	# previous subpart's payload (or epilogue if the previous
				357	# part is a multipart).
				358	if self._last.get_content_maintype() == 'multipart':
				359	epilogue = self._last.epilogue
				360	if epilogue == '':
				361	self._last.epilogue = None
				362	elif epilogue is not None:
				363	mo = NLCRE_eol.search(epilogue)
				364	if mo:
				365	end = len(mo.group(0))
				366	self._last.epilogue = epilogue[:-end]
				367	else:
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	368	payload = self._last._payload
Guido van Rossum	3172c5d	2007-10-16 18:12:55 +0000	[diff] [blame]	369	if isinstance(payload, str):
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	370	mo = NLCRE_eol.search(payload)
				371	if mo:
				372	payload = payload[:-len(mo.group(0))]
R David Murray	c5c1472	2011-04-06 08:13:02 -0400	[diff] [blame]	373	self._last._payload = payload
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	374	self._input.pop_eof_matcher()
				375	self._pop_message()
				376	# Set the multipart up for newline cleansing, which will
				377	# happen if we're in a nested multipart.
				378	self._last = self._cur
				379	else:
				380	# I think we must be in the preamble
				381	assert capturing_preamble
				382	preamble.append(line)
				383	# We've seen either the EOF or the end boundary. If we're still
				384	# capturing the preamble, we never saw the start boundary. Note
				385	# that as a defect and store the captured text as the payload.
				386	# Everything from here to the EOF is epilogue.
				387	if capturing_preamble:
				388	self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
				389	self._cur.set_payload(EMPTYSTRING.join(preamble))
				390	epilogue = []
				391	for line in self._input:
				392	if line is NeedMoreData:
				393	yield NeedMoreData
				394	continue
				395	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				396	return
				397	# If the end boundary ended in a newline, we'll need to make sure
				398	# the epilogue isn't None
				399	if linesep:
				400	epilogue = ['']
				401	else:
				402	epilogue = []
				403	for line in self._input:
				404	if line is NeedMoreData:
				405	yield NeedMoreData
				406	continue
				407	epilogue.append(line)
				408	# Any CRLF at the front of the epilogue is not technically part of
				409	# the epilogue. Also, watch out for an empty string epilogue,
				410	# which means a single newline.
				411	if epilogue:
				412	firstline = epilogue[0]
				413	bolmo = NLCRE_bol.match(firstline)
				414	if bolmo:
				415	epilogue[0] = firstline[len(bolmo.group(0)):]
				416	self._cur.epilogue = EMPTYSTRING.join(epilogue)
				417	return
				418	# Otherwise, it's some non-multipart type, so the entire rest of the
				419	# file contents becomes the payload.
				420	lines = []
				421	for line in self._input:
				422	if line is NeedMoreData:
				423	yield NeedMoreData
				424	continue
				425	lines.append(line)
				426	self._cur.set_payload(EMPTYSTRING.join(lines))
				427
				428	def _parse_headers(self, lines):
				429	# Passed a list of lines that make up the headers for the current msg
				430	lastheader = ''
				431	lastvalue = []
				432	for lineno, line in enumerate(lines):
				433	# Check for continuation
				434	if line[0] in ' \t':
				435	if not lastheader:
				436	# The first line of the headers was a continuation. This
				437	# is illegal, so let's note the defect, store the illegal
				438	# line, and ignore it for purposes of headers.
				439	defect = errors.FirstHeaderLineIsContinuationDefect(line)
				440	self._cur.defects.append(defect)
				441	continue
				442	lastvalue.append(line)
				443	continue
				444	if lastheader:
				445	# XXX reconsider the joining of folded lines
				446	lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
				447	self._cur[lastheader] = lhdr
				448	lastheader, lastvalue = '', []
				449	# Check for envelope header, i.e. unix-from
				450	if line.startswith('From '):
				451	if lineno == 0:
				452	# Strip off the trailing newline
				453	mo = NLCRE_eol.search(line)
				454	if mo:
				455	line = line[:-len(mo.group(0))]
				456	self._cur.set_unixfrom(line)
				457	continue
				458	elif lineno == len(lines) - 1:
				459	# Something looking like a unix-from at the end - it's
				460	# probably the first line of the body, so push back the
				461	# line and stop.
				462	self._input.unreadline(line)
				463	return
				464	else:
				465	# Weirdly placed unix-from line. Note this as a defect
				466	# and ignore it.
				467	defect = errors.MisplacedEnvelopeHeaderDefect(line)
				468	self._cur.defects.append(defect)
				469	continue
				470	# Split the line on the colon separating field name from value.
				471	i = line.find(':')
				472	if i < 0:
				473	defect = errors.MalformedHeaderDefect(line)
				474	self._cur.defects.append(defect)
				475	continue
				476	lastheader = line[:i]
				477	lastvalue = [line[i+1:].lstrip()]
				478	# Done with all the lines, so handle the last header.
				479	if lastheader:
				480	# XXX reconsider the joining of folded lines
				481	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
R. David Murray	96fd54e	2010-10-08 15:55:28 +0000	[diff] [blame]	482
				483
				484	class BytesFeedParser(FeedParser):
				485	"""Like FeedParser, but feed accepts bytes."""
				486
				487	def feed(self, data):
				488	super().feed(data.decode('ascii', 'surrogateescape'))