Blame - Lib/email/FeedParser.py - platform/external/python/cpython3

blob: a82d305ba529679fd512f3fbbe9b38b9f7604f5c [file] [log] [blame]

Anthony Baxter	39a0f04	2004-03-22 00:33:28 +0000	[diff] [blame^]	1	# A new Feed-style Parser
				2
				3	from email import Errors, Message
				4	import re
				5
				6	NLCRE = re.compile('\r\n\|\r\|\n')
				7
				8	EMPTYSTRING = ''
				9	NL = '\n'
				10
				11	NeedMoreData = object()
				12
				13	class FeedableLumpOfText:
				14	"A file-like object that can have new data loaded into it"
				15
				16	def __init__(self):
				17	self._partial = ''
				18	self._done = False
				19	# _pending is a list of lines, in reverse order
				20	self._pending = []
				21
				22	def readline(self):
				23	""" Return a line of data.
				24
				25	If data has been pushed back with unreadline(), the most recently
				26	returned unreadline()d data will be returned.
				27	"""
				28	if not self._pending:
				29	if self._done:
				30	return ''
				31	return NeedMoreData
				32	return self._pending.pop()
				33
				34	def unreadline(self, line):
				35	""" Push a line back into the object.
				36	"""
				37	self._pending.append(line)
				38
				39	def peekline(self):
				40	""" Non-destructively look at the next line """
				41	if not self._pending:
				42	if self._done:
				43	return ''
				44	return NeedMoreData
				45	return self._pending[-1]
				46
				47
				48	# for r in self._input.readuntil(regexp):
				49	# if r is NeedMoreData:
				50	# yield NeedMoreData
				51	# preamble, matchobj = r
				52	def readuntil(self, matchre, afterblank=False, includematch=False):
				53	""" Read a line at a time until we get the specified RE.
				54
				55	Returns the text up to (and including, if includematch is true) the
				56	matched text, and the RE match object. If afterblank is true,
				57	there must be a blank line before the matched text. Moves current
				58	filepointer to the line following the matched line. If we reach
				59	end-of-file, return what we've got so far, and return None as the
				60	RE match object.
				61	"""
				62	prematch = []
				63	blankseen = 0
				64	while 1:
				65	if not self._pending:
				66	if self._done:
				67	# end of file
				68	yield EMPTYSTRING.join(prematch), None
				69	else:
				70	yield NeedMoreData
				71	continue
				72	line = self._pending.pop()
				73	if afterblank:
				74	if NLCRE.match(line):
				75	blankseen = 1
				76	continue
				77	else:
				78	blankseen = 0
				79	m = matchre.match(line)
				80	if (m and not afterblank) or (m and afterblank and blankseen):
				81	if includematch:
				82	prematch.append(line)
				83	yield EMPTYSTRING.join(prematch), m
				84	prematch.append(line)
				85
				86
				87	NLatend = re.compile('(\r\n\|\r\|\n)$').match
				88	NLCRE_crack = re.compile('(\r\n\|\r\|\n)')
				89
				90	def push(self, data):
				91	""" Push some new data into this object """
				92	# Handle any previous leftovers
				93	data, self._partial = self._partial+data, ''
				94	# Crack into lines, but leave the newlines on the end of each
				95	lines = self.NLCRE_crack.split(data)
				96	# The ahem interesting behaviour of re.split when supplied
				97	# groups means that the last element is the data after the
				98	# final RE. In the case of a NL/CR terminated string, this is
				99	# the empty string.
				100	self._partial = lines.pop()
				101	o = []
				102	for i in range(len(lines) / 2):
				103	o.append(EMPTYSTRING.join([lines[i2], lines[i2+1]]))
				104	self.pushlines(o)
				105
				106	def pushlines(self, lines):
				107	""" Push a list of new lines into the object """
				108	# Reverse and insert at the front of _pending
				109	self._pending[:0] = lines[::-1]
				110
				111	def end(self):
				112	""" There is no more data """
				113	self._done = True
				114
				115	def is_done(self):
				116	return self._done
				117
				118	def __iter__(self):
				119	return self
				120
				121	def next(self):
				122	l = self.readline()
				123	if l == '':
				124	raise StopIteration
				125	return l
				126
				127	class FeedParser:
				128	"A feed-style parser of email. copy docstring here"
				129
				130	def __init__(self, _class=Message.Message):
				131	"fnord fnord fnord"
				132	self._class = _class
				133	self._input = FeedableLumpOfText()
				134	self._root = None
				135	self._objectstack = []
				136	self._parse = self._parsegen().next
				137
				138	def end(self):
				139	self._input.end()
				140	self._call_parse()
				141	return self._root
				142
				143	def feed(self, data):
				144	self._input.push(data)
				145	self._call_parse()
				146
				147	def _call_parse(self):
				148	try:
				149	self._parse()
				150	except StopIteration:
				151	pass
				152
				153	headerRE = re.compile(r'^(From \|[-\w]{2,}:\|[\t ])')
				154
				155	def _parse_headers(self,headerlist):
				156	# Passed a list of strings that are the headers for the
				157	# current object
				158	lastheader = ''
				159	lastvalue = []
				160
				161
				162	for lineno, line in enumerate(headerlist):
				163	# Check for continuation
				164	if line[0] in ' \t':
				165	if not lastheader:
				166	raise Errors.HeaderParseError('First line must not be a continuation')
				167	lastvalue.append(line)
				168	continue
				169
				170	if lastheader:
				171	# XXX reconsider the joining of folded lines
				172	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
				173	lastheader, lastvalue = '', []
				174
				175	# Check for Unix-From
				176	if line.startswith('From '):
				177	if lineno == 0:
				178	self._cur.set_unixfrom(line)
				179	continue
				180	elif lineno == len(headerlist) - 1:
				181	# Something looking like a unix-from at the end - it's
				182	# probably the first line of the body
				183	self._input.unreadline(line)
				184	return
				185	else:
				186	# Weirdly placed unix-from line. Ignore it.
				187	continue
				188
				189	i = line.find(':')
				190	if i < 0:
				191	# The older parser had various special-cases here. We've
				192	# already handled them
				193	raise Errors.HeaderParseError(
				194	"Not a header, not a continuation: ``%s''" % line)
				195	lastheader = line[:i]
				196	lastvalue = [line[i+1:].lstrip()]
				197
				198	if lastheader:
				199	# XXX reconsider the joining of folded lines
				200	self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
				201
				202
				203	def _parsegen(self):
				204	# Parse any currently available text
				205	self._new_sub_object()
				206	self._root = self._cur
				207	completing = False
				208	last = None
				209
				210	for line in self._input:
				211	if line is NeedMoreData:
				212	yield None # Need More Data
				213	continue
				214	self._input.unreadline(line)
				215	if not completing:
				216	headers = []
				217	# Now collect all headers.
				218	for line in self._input:
				219	if line is NeedMoreData:
				220	yield None # Need More Data
				221	continue
				222	if not self.headerRE.match(line):
				223	self._parse_headers(headers)
				224	# A message/rfc822 has no body and no internal
				225	# boundary.
				226	if self._cur.get_content_maintype() == "message":
				227	self._new_sub_object()
				228	completing = False
				229	headers = []
				230	continue
				231	if line.strip():
				232	# No blank line between headers and body.
				233	# Push this line back, it's the first line of
				234	# the body.
				235	self._input.unreadline(line)
				236	break
				237	else:
				238	headers.append(line)
				239	else:
				240	# We're done with the data and are still inside the headers
				241	self._parse_headers(headers)
				242
				243	# Now we're dealing with the body
				244	boundary = self._cur.get_boundary()
				245	isdigest = (self._cur.get_content_type() == 'multipart/digest')
				246	if boundary and not self._cur._finishing:
				247	separator = '--' + boundary
				248	self._cur._boundaryRE = re.compile(
				249	r'(?P<sep>' + re.escape(separator) +
				250	r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n\|\r\|\n)$')
				251	for r in self._input.readuntil(self._cur._boundaryRE):
				252	if r is NeedMoreData:
				253	yield NeedMoreData
				254	else:
				255	preamble, matchobj = r
				256	break
				257	if not matchobj:
				258	# Broken - we hit the end of file. Just set the body
				259	# to the text.
				260	if completing:
				261	self._attach_trailer(last, preamble)
				262	else:
				263	self._attach_preamble(self._cur, preamble)
				264	# XXX move back to the parent container.
				265	self._pop_container()
				266	completing = True
				267	continue
				268	if preamble:
				269	if completing:
				270	preamble = preamble[:-len(matchobj.group('linesep'))]
				271	self._attach_trailer(last, preamble)
				272	else:
				273	self._attach_preamble(self._cur, preamble)
				274	elif not completing:
				275	# The module docs specify an empty preamble is None, not ''
				276	self._cur.preamble = None
				277	# If we _are_ completing, the last object gets no payload
				278
				279	if matchobj.group('end'):
				280	# That was the end boundary tag. Bounce back to the
				281	# parent container
				282	last = self._pop_container()
				283	self._input.unreadline(matchobj.group('linesep'))
				284	completing = True
				285	continue
				286
				287	# A number of MTAs produced by a nameless large company
				288	# we shall call "SicroMoft" produce repeated boundary
				289	# lines.
				290	while True:
				291	line = self._input.peekline()
				292	if line is NeedMoreData:
				293	yield None
				294	continue
				295	if self._cur._boundaryRE.match(line):
				296	self._input.readline()
				297	else:
				298	break
				299
				300	self._new_sub_object()
				301
				302	completing = False
				303	if isdigest:
				304	self._cur.set_default_type('message/rfc822')
				305	continue
				306	else:
				307	# non-multipart or after end-boundary
				308	if last is not self._root:
				309	last = self._pop_container()
				310	if self._cur.get_content_maintype() == "message":
				311	# We double-pop to leave the RFC822 object
				312	self._pop_container()
				313	completing = True
				314	elif self._cur._boundaryRE and last <> self._root:
				315	completing = True
				316	else:
				317	# Non-multipart top level, or in the trailer of the
				318	# top level multipart
				319	while not self._input.is_done():
				320	yield None
				321	data = list(self._input)
				322	body = EMPTYSTRING.join(data)
				323	self._attach_trailer(last, body)
				324
				325
				326	def _attach_trailer(self, obj, trailer):
				327	#import pdb ; pdb.set_trace()
				328	if obj.get_content_maintype() in ( "multipart", "message" ):
				329	obj.epilogue = trailer
				330	else:
				331	obj.set_payload(trailer)
				332
				333	def _attach_preamble(self, obj, trailer):
				334	if obj.get_content_maintype() in ( "multipart", "message" ):
				335	obj.preamble = trailer
				336	else:
				337	obj.set_payload(trailer)
				338
				339
				340	def _new_sub_object(self):
				341	new = self._class()
				342	#print "pushing", self._objectstack, repr(new)
				343	if self._objectstack:
				344	self._objectstack[-1].attach(new)
				345	self._objectstack.append(new)
				346	new._boundaryRE = None
				347	new._finishing = False
				348	self._cur = new
				349
				350	def _pop_container(self):
				351	# Move the pointer to the container of the current object.
				352	# Returns the (old) current object
				353	#import pdb ; pdb.set_trace()
				354	#print "popping", self._objectstack
				355	last = self._objectstack.pop()
				356	if self._objectstack:
				357	self._cur = self._objectstack[-1]
				358	else:
				359	self._cur._finishing = True
				360	return last
				361
				362