"""Header value parser implementing various email-related RFC parsing rules.

The parsing methods defined in this module implement various email-related
parsing rules. Principal among them is RFC 5322, which is the follow-on
to RFC 2822 and primarily a clarification of the former. It also implements
RFC 2047 encoded word decoding.

RFC 5322 goes to considerable trouble to maintain backward compatibility with
RFC 822 in the parse phase, while cleaning up the structure in the generation
phase. This parser supports correct RFC 5322 generation by tagging white space
as folding white space only when folding is allowed in the non-obsolete rule
sets. Actually, the parser is even more generous when accepting input than RFC
5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
Where possible, deviations from the standard are annotated on the 'defects'
attribute of tokens that deviate.

The general structure of the parser follows RFC 5322, and uses its terminology
where there is a direct correspondence. Where the implementation requires a
somewhat different structure than that used by the formal grammar, new terms
that mimic the closest existing terms are used. Thus, it really helps to have
a copy of RFC 5322 handy when studying this code.

Input to the parser is a string that has already been unfolded according to
RFC 5322 rules. According to the RFC this unfolding is the very first step, and
this parser leaves the unfolding step to a higher level message parser, which
will have already detected the line breaks that need unfolding while
determining the beginning and end of each header.

The output of the parser is a TokenList object, which is a list subclass. A
TokenList is a recursive data structure. The terminal nodes of the structure
are Terminal objects, which are subclasses of str. These do not correspond
directly to terminal objects in the formal grammar, but are instead more
practical higher level combinations of true terminals.

All TokenList and Terminal objects have a 'value' attribute, which produces the
semantically meaningful value of that part of the parse subtree. The value of
all whitespace tokens (no matter how many sub-tokens they may contain) is a
single space, as per the RFC rules. This includes 'CFWS', which is herein
included in the general class of whitespace tokens. There is one exception to
the rule that whitespace tokens are collapsed into single spaces in values: in
the value of a 'bare-quoted-string' (a quoted-string with no leading or
trailing whitespace), any whitespace that appeared between the quotation marks
is preserved in the returned value. Note that in all Terminal strings quoted
pairs are turned into their unquoted values.

All TokenList and Terminal objects also have a string value, which attempts to
be a "canonical" representation of the RFC-compliant form of the substring that
produced the parsed subtree, including minimal use of quoted pair quoting.
Whitespace runs are not collapsed.

Comment tokens also have a 'content' attribute providing the string found
between the parens (including any nested comments) with whitespace preserved.

All TokenList and Terminal objects have a 'defects' attribute which is a
possibly empty list of all of the defects found while creating the token.
Defects may appear on any token in the tree, and a composite list of all
defects in the subtree is available through the 'all_defects' attribute of any
node. (For Terminal nodes, x.defects == x.all_defects.)

Each object in a parse tree is called a 'token', and each has a 'token_type'
attribute that gives the name from the RFC 5322 grammar that it represents.
Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
may be produced: 'ptext'. A 'ptext' is a string of printable ASCII characters.
It is returned in place of lists of (ctext/quoted-pair) and
(qtext/quoted-pair).

XXX: provide complete list of token types.
"""
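
# A rough usage sketch of the parse tree described above (illustrative only;
# the exact reprs shown are indicative):
#
#   >>> from email._header_value_parser import get_unstructured
#   >>> tl = get_unstructured('Hello  big\tworld')
#   >>> tl.token_type
#   'unstructured'
#   >>> str(tl)      # canonical string: whitespace runs are preserved
#   'Hello  big\tworld'
#   >>> tl.value     # semantic value: each whitespace run is a single space
#   'Hello big world'
#   >>> tl.all_defects
#   []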

import re
import urllib   # For urllib.parse.unquote
from string import hexdigits
from collections import OrderedDict
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils

#
# Useful constants and functions
#

WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')

def quote_string(value):
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
98
99#
100# Accumulator for header folding
101#
102
103class _Folded:
104
105 def __init__(self, maxlen, policy):
106 self.maxlen = maxlen
107 self.policy = policy
108 self.lastlen = 0
109 self.stickyspace = None
110 self.firstline = True
111 self.done = []
112 self.current = []
113
114 def newline(self):
115 self.done.extend(self.current)
116 self.done.append(self.policy.linesep)
117 self.current.clear()
118 self.lastlen = 0
119
120 def finalize(self):
121 if self.current:
122 self.newline()
123
124 def __str__(self):
125 return ''.join(self.done)
126
127 def append(self, stoken):
128 self.current.append(stoken)
129
130 def append_if_fits(self, token, stoken=None):
131 if stoken is None:
132 stoken = str(token)
133 l = len(stoken)
134 if self.stickyspace is not None:
135 stickyspace_len = len(self.stickyspace)
136 if self.lastlen + stickyspace_len + l <= self.maxlen:
137 self.current.append(self.stickyspace)
138 self.lastlen += stickyspace_len
139 self.current.append(stoken)
140 self.lastlen += l
141 self.stickyspace = None
142 self.firstline = False
143 return True
144 if token.has_fws:
145 ws = token.pop_leading_fws()
146 if ws is not None:
147 self.stickyspace += str(ws)
148 stickyspace_len += len(ws)
149 token._fold(self)
150 return True
151 if stickyspace_len and l + 1 <= self.maxlen:
152 margin = self.maxlen - l
153 if 0 < margin < stickyspace_len:
154 trim = stickyspace_len - margin
155 self.current.append(self.stickyspace[:trim])
156 self.stickyspace = self.stickyspace[trim:]
157 stickyspace_len = trim
158 self.newline()
159 self.current.append(self.stickyspace)
160 self.current.append(stoken)
161 self.lastlen = l + stickyspace_len
162 self.stickyspace = None
163 self.firstline = False
164 return True
165 if not self.firstline:
166 self.newline()
167 self.current.append(self.stickyspace)
168 self.current.append(stoken)
169 self.stickyspace = None
170 self.firstline = False
171 return True
172 if self.lastlen + l <= self.maxlen:
173 self.current.append(stoken)
174 self.lastlen += l
175 return True
176 if l < self.maxlen:
177 self.newline()
178 self.current.append(stoken)
179 self.lastlen = l
180 return True
181 return False
182
183#
184# TokenList and its subclasses
185#
186
187class TokenList(list):
188
189 token_type = None
190
191 def __init__(self, *args, **kw):
192 super().__init__(*args, **kw)
193 self.defects = []
194
195 def __str__(self):
196 return ''.join(str(x) for x in self)
197
198 def __repr__(self):
199 return '{}({})'.format(self.__class__.__name__,
200 super().__repr__())
201
202 @property
203 def value(self):
204 return ''.join(x.value for x in self if x.value)
205
206 @property
207 def all_defects(self):
208 return sum((x.all_defects for x in self), self.defects)
209
    #
    # Folding API
    #
    # parts():
    #
    # return a list of objects that constitute the "higher level syntactic
    # objects" specified by the RFC as the best places to fold a header line.
    # The returned objects must include leading folding white space, even if
    # this means mutating the underlying parse tree of the object. Each object
    # is only responsible for returning *its* parts, and should not drill down
    # to any lower level except as required to meet the leading folding white
    # space constraint.
    #
    # _fold(folded):
    #
    # folded: the result accumulator. This is an instance of _Folded.
    #     (XXX: I haven't finished factoring this out yet, the folding code
    #     pretty much uses this as a state object.) When the folded.current
    #     contains as much text as will fit, the _fold method should call
    #     folded.newline.
    # folded.lastlen: the current length of the text stored in folded.current.
    # folded.maxlen: The maximum number of characters that may appear on a
    #     folded line. Differs from the policy setting in that "no limit" is
    #     represented by +inf, which means it can be used in the trivially
    #     logical fashion in comparisons.
    #
    # Currently no subclasses implement parts, and I think this will remain
    # true. A subclass only needs to implement _fold when the generic version
    # isn't sufficient. _fold will need to be implemented primarily when it is
    # possible for encoded words to appear in the specialized token-list, since
    # there is no generic algorithm that can know where exactly the encoded
    # words are allowed. A _fold implementation is responsible for filling
    # lines in the same general way that the top level _fold does. It may, and
    # should, call the _fold method of sub-objects in a similar fashion to that
    # of the top level _fold.
    #
    # XXX: I'm hoping it will be possible to factor the existing code further
    # to reduce redundancy and make the logic clearer.
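    #
    # A minimal sketch of how these pieces are driven (this mirrors the
    # fold() method below; 'policy' is assumed to be an email policy object
    # providing max_line_length and linesep):
    #
    #     folded = _Folded(policy.max_line_length or float('+inf'), policy)
    #     token_list._fold(folded)   # fill folded.current/folded.done
    #     folded.finalize()          # flush any remaining partial line
    #     text = str(folded)         # the folded header text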
248
249 @property
250 def parts(self):
251 klass = self.__class__
252 this = []
253 for token in self:
254 if token.startswith_fws():
255 if this:
256 yield this[0] if len(this)==1 else klass(this)
257 this.clear()
258 end_ws = token.pop_trailing_ws()
259 this.append(token)
260 if end_ws:
261 yield klass(this)
262 this = [end_ws]
263 if this:
264 yield this[0] if len(this)==1 else klass(this)
265
266 def startswith_fws(self):
267 return self[0].startswith_fws()
268
269 def pop_leading_fws(self):
270 if self[0].token_type == 'fws':
271 return self.pop(0)
272 return self[0].pop_leading_fws()
273
274 def pop_trailing_ws(self):
275 if self[-1].token_type == 'cfws':
276 return self.pop(-1)
277 return self[-1].pop_trailing_ws()
278
279 @property
280 def has_fws(self):
281 for part in self:
282 if part.has_fws:
283 return True
284 return False
285
286 def has_leading_comment(self):
287 return self[0].has_leading_comment()
288
289 @property
290 def comments(self):
291 comments = []
292 for token in self:
293 comments.extend(token.comments)
294 return comments
295
296 def fold(self, *, policy):
297 # max_line_length 0/None means no limit, ie: infinitely long.
298 maxlen = policy.max_line_length or float("+inf")
299 folded = _Folded(maxlen, policy)
300 self._fold(folded)
301 folded.finalize()
302 return str(folded)
303
304 def as_encoded_word(self, charset):
305 # This works only for things returned by 'parts', which include
306 # the leading fws, if any, that should be used.
307 res = []
308 ws = self.pop_leading_fws()
309 if ws:
310 res.append(ws)
311 trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
312 res.append(_ew.encode(str(self), charset))
313 res.append(trailer)
314 return ''.join(res)
315
316 def cte_encode(self, charset, policy):
317 res = []
318 for part in self:
319 res.append(part.cte_encode(charset, policy))
320 return ''.join(res)
321
322 def _fold(self, folded):
323 for part in self.parts:
324 tstr = str(part)
325 tlen = len(tstr)
326 try:
327 str(part).encode('us-ascii')
328 except UnicodeEncodeError:
329 if any(isinstance(x, errors.UndecodableBytesDefect)
330 for x in part.all_defects):
331 charset = 'unknown-8bit'
332 else:
333 # XXX: this should be a policy setting
334 charset = 'utf-8'
335 tstr = part.cte_encode(charset, folded.policy)
336 tlen = len(tstr)
337 if folded.append_if_fits(part, tstr):
338 continue
339 # Peel off the leading whitespace if any and make it sticky, to
340 # avoid infinite recursion.
341 ws = part.pop_leading_fws()
342 if ws is not None:
343 # Peel off the leading whitespace and make it sticky, to
344 # avoid infinite recursion.
345 folded.stickyspace = str(part.pop(0))
346 if folded.append_if_fits(part):
347 continue
348 if part.has_fws:
349 part._fold(folded)
350 continue
351 # There are no fold points in this one; it is too long for a single
352 # line and can't be split...we just have to put it on its own line.
353 folded.append(tstr)
354 folded.newline()
355
356 def pprint(self, indent=''):
357 print('\n'.join(self._pp(indent='')))
358
359 def ppstr(self, indent=''):
360 return '\n'.join(self._pp(indent=''))
361
362 def _pp(self, indent=''):
363 yield '{}{}/{}('.format(
364 indent,
365 self.__class__.__name__,
366 self.token_type)
367 for token in self:
R David Murray97f43c02012-06-24 05:03:27 -0400368 if not hasattr(token, '_pp'):
369 yield (indent + ' !! invalid element in token '
370 'list: {!r}'.format(token))
371 else:
Philip Jenvey4993cc02012-10-01 12:53:43 -0700372 yield from token._pp(indent+' ')
R David Murray0b6f6c82012-05-25 18:42:14 -0400373 if self.defects:
374 extra = ' Defects: {}'.format(self.defects)
375 else:
376 extra = ''
377 yield '{}){}'.format(indent, extra)
378
379
380class WhiteSpaceTokenList(TokenList):
381
382 @property
383 def value(self):
384 return ' '
385
386 @property
387 def comments(self):
388 return [x.content for x in self if x.token_type=='comment']
389
390
391class UnstructuredTokenList(TokenList):
392
393 token_type = 'unstructured'
394
395 def _fold(self, folded):
R David Murray0b6f6c82012-05-25 18:42:14 -0400396 last_ew = None
397 for part in self.parts:
398 tstr = str(part)
399 is_ew = False
400 try:
401 str(part).encode('us-ascii')
402 except UnicodeEncodeError:
403 if any(isinstance(x, errors.UndecodableBytesDefect)
404 for x in part.all_defects):
405 charset = 'unknown-8bit'
406 else:
407 charset = 'utf-8'
408 if last_ew is not None:
409 # We've already done an EW, combine this one with it
410 # if there's room.
411 chunk = get_unstructured(
412 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
413 oldlastlen = sum(len(x) for x in folded.current[:last_ew])
414 schunk = str(chunk)
415 lchunk = len(schunk)
416 if oldlastlen + lchunk <= folded.maxlen:
417 del folded.current[last_ew:]
418 folded.append(schunk)
419 folded.lastlen = oldlastlen + lchunk
420 continue
421 tstr = part.as_encoded_word(charset)
422 is_ew = True
423 if folded.append_if_fits(part, tstr):
424 if is_ew:
425 last_ew = len(folded.current) - 1
426 continue
427 if is_ew or last_ew:
428 # It's too big to fit on the line, but since we've
429 # got encoded words we can use encoded word folding.
430 part._fold_as_ew(folded)
431 continue
432 # Peel off the leading whitespace if any and make it sticky, to
433 # avoid infinite recursion.
434 ws = part.pop_leading_fws()
435 if ws is not None:
436 folded.stickyspace = str(ws)
437 if folded.append_if_fits(part):
438 continue
439 if part.has_fws:
                part._fold(folded)
441 continue
442 # It can't be split...we just have to put it on its own line.
443 folded.append(tstr)
444 folded.newline()
445 last_ew = None
446
447 def cte_encode(self, charset, policy):
448 res = []
449 last_ew = None
450 for part in self:
451 spart = str(part)
452 try:
453 spart.encode('us-ascii')
454 res.append(spart)
455 except UnicodeEncodeError:
456 if last_ew is None:
457 res.append(part.cte_encode(charset, policy))
458 last_ew = len(res)
459 else:
460 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res.append(tl.as_encoded_word(charset))
462 return ''.join(res)
463
464
465class Phrase(TokenList):
466
467 token_type = 'phrase'
468
469 def _fold(self, folded):
470 # As with Unstructured, we can have pure ASCII with or without
471 # surrogateescape encoded bytes, or we could have unicode. But this
472 # case is more complicated, since we have to deal with the various
473 # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token has a
        # comment that becomes a barrier across which we can't compose encoded
        # words.
477 last_ew = None
478 for part in self.parts:
479 tstr = str(part)
480 tlen = len(tstr)
481 has_ew = False
482 try:
483 str(part).encode('us-ascii')
484 except UnicodeEncodeError:
485 if any(isinstance(x, errors.UndecodableBytesDefect)
486 for x in part.all_defects):
487 charset = 'unknown-8bit'
488 else:
489 charset = 'utf-8'
490 if last_ew is not None and not part.has_leading_comment():
491 # We've already done an EW, let's see if we can combine
492 # this one with it. The last_ew logic ensures that all we
493 # have at this point is atoms, no comments or quoted
494 # strings. So we can treat the text between the last
495 # encoded word and the content of this token as
496 # unstructured text, and things will work correctly. But
497 # we have to strip off any trailing comment on this token
498 # first, and if it is a quoted string we have to pull out
499 # the content (we're encoding it, so it no longer needs to
500 # be quoted).
501 if part[-1].token_type == 'cfws' and part.comments:
502 remainder = part.pop(-1)
503 else:
504 remainder = ''
505 for i, token in enumerate(part):
506 if token.token_type == 'bare-quoted-string':
507 part[i] = UnstructuredTokenList(token[:])
508 chunk = get_unstructured(
509 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
510 schunk = str(chunk)
511 lchunk = len(schunk)
512 if last_ew + lchunk <= folded.maxlen:
513 del folded.current[last_ew:]
514 folded.append(schunk)
515 folded.lastlen = sum(len(x) for x in folded.current)
516 continue
517 tstr = part.as_encoded_word(charset)
518 tlen = len(tstr)
519 has_ew = True
520 if folded.append_if_fits(part, tstr):
521 if has_ew and not part.comments:
522 last_ew = len(folded.current) - 1
523 elif part.comments or part.token_type == 'quoted-string':
524 # If a comment is involved we can't combine EWs. And if a
525 # quoted string is involved, it's not worth the effort to
526 # try to combine them.
527 last_ew = None
528 continue
529 part._fold(folded)
530
531 def cte_encode(self, charset, policy):
532 res = []
533 last_ew = None
534 is_ew = False
535 for part in self:
536 spart = str(part)
537 try:
538 spart.encode('us-ascii')
539 res.append(spart)
540 except UnicodeEncodeError:
541 is_ew = True
542 if last_ew is None:
543 if not part.comments:
544 last_ew = len(res)
545 res.append(part.cte_encode(charset, policy))
546 elif not part.has_leading_comment():
547 if part[-1].token_type == 'cfws' and part.comments:
548 remainder = part.pop(-1)
549 else:
550 remainder = ''
551 for i, token in enumerate(part):
552 if token.token_type == 'bare-quoted-string':
553 part[i] = UnstructuredTokenList(token[:])
554 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
555 res[last_ew:] = [tl.as_encoded_word(charset)]
556 if part.comments or (not is_ew and part.token_type == 'quoted-string'):
557 last_ew = None
558 return ''.join(res)
559
560class Word(TokenList):
561
562 token_type = 'word'
563
564
565class CFWSList(WhiteSpaceTokenList):
566
567 token_type = 'cfws'
568
569 def has_leading_comment(self):
570 return bool(self.comments)
571
572
573class Atom(TokenList):
574
575 token_type = 'atom'
576
577
R David Murray97f43c02012-06-24 05:03:27 -0400578class Token(TokenList):
579
580 token_type = 'token'
581
582
R David Murray0b6f6c82012-05-25 18:42:14 -0400583class EncodedWord(TokenList):
584
585 token_type = 'encoded-word'
586 cte = None
587 charset = None
588 lang = None
589
590 @property
591 def encoded(self):
592 if self.cte is not None:
593 return self.cte
        return _ew.encode(str(self), self.charset)
595
596
597
598class QuotedString(TokenList):
599
600 token_type = 'quoted-string'
601
602 @property
603 def content(self):
604 for x in self:
605 if x.token_type == 'bare-quoted-string':
606 return x.value
607
608 @property
609 def quoted_value(self):
610 res = []
611 for x in self:
612 if x.token_type == 'bare-quoted-string':
613 res.append(str(x))
614 else:
615 res.append(x.value)
616 return ''.join(res)
617
R David Murray97f43c02012-06-24 05:03:27 -0400618 @property
619 def stripped_value(self):
620 for token in self:
621 if token.token_type == 'bare-quoted-string':
622 return token.value
623
R David Murray0b6f6c82012-05-25 18:42:14 -0400624
625class BareQuotedString(QuotedString):
626
627 token_type = 'bare-quoted-string'
628
629 def __str__(self):
R David Murray97f43c02012-06-24 05:03:27 -0400630 return quote_string(''.join(str(x) for x in self))
R David Murray0b6f6c82012-05-25 18:42:14 -0400631
632 @property
633 def value(self):
634 return ''.join(str(x) for x in self)
635
636
637class Comment(WhiteSpaceTokenList):
638
639 token_type = 'comment'
640
641 def __str__(self):
642 return ''.join(sum([
643 ["("],
644 [self.quote(x) for x in self],
645 [")"],
646 ], []))
647
648 def quote(self, value):
649 if value.token_type == 'comment':
650 return str(value)
651 return str(value).replace('\\', '\\\\').replace(
652 '(', '\(').replace(
653 ')', '\)')
654
655 @property
656 def content(self):
657 return ''.join(str(x) for x in self)
658
659 @property
660 def comments(self):
661 return [self.content]
662
663class AddressList(TokenList):
664
665 token_type = 'address-list'
666
667 @property
668 def addresses(self):
669 return [x for x in self if x.token_type=='address']
670
671 @property
672 def mailboxes(self):
673 return sum((x.mailboxes
674 for x in self if x.token_type=='address'), [])
675
676 @property
677 def all_mailboxes(self):
678 return sum((x.all_mailboxes
679 for x in self if x.token_type=='address'), [])
680
681
682class Address(TokenList):
683
684 token_type = 'address'
685
686 @property
687 def display_name(self):
688 if self[0].token_type == 'group':
689 return self[0].display_name
690
691 @property
692 def mailboxes(self):
693 if self[0].token_type == 'mailbox':
694 return [self[0]]
695 elif self[0].token_type == 'invalid-mailbox':
696 return []
697 return self[0].mailboxes
698
699 @property
700 def all_mailboxes(self):
701 if self[0].token_type == 'mailbox':
702 return [self[0]]
703 elif self[0].token_type == 'invalid-mailbox':
704 return [self[0]]
705 return self[0].all_mailboxes
706
707class MailboxList(TokenList):
708
709 token_type = 'mailbox-list'
710
711 @property
712 def mailboxes(self):
713 return [x for x in self if x.token_type=='mailbox']
714
715 @property
716 def all_mailboxes(self):
717 return [x for x in self
718 if x.token_type in ('mailbox', 'invalid-mailbox')]
719
720
721class GroupList(TokenList):
722
723 token_type = 'group-list'
724
725 @property
726 def mailboxes(self):
727 if not self or self[0].token_type != 'mailbox-list':
728 return []
729 return self[0].mailboxes
730
731 @property
732 def all_mailboxes(self):
733 if not self or self[0].token_type != 'mailbox-list':
734 return []
735 return self[0].all_mailboxes
736
737
738class Group(TokenList):
739
740 token_type = "group"
741
742 @property
743 def mailboxes(self):
744 if self[2].token_type != 'group-list':
745 return []
746 return self[2].mailboxes
747
748 @property
749 def all_mailboxes(self):
750 if self[2].token_type != 'group-list':
751 return []
752 return self[2].all_mailboxes
753
754 @property
755 def display_name(self):
756 return self[0].display_name
757
758
759class NameAddr(TokenList):
760
761 token_type = 'name-addr'
762
763 @property
764 def display_name(self):
765 if len(self) == 1:
766 return None
767 return self[0].display_name
768
769 @property
770 def local_part(self):
771 return self[-1].local_part
772
773 @property
774 def domain(self):
775 return self[-1].domain
776
777 @property
778 def route(self):
779 return self[-1].route
780
781 @property
782 def addr_spec(self):
783 return self[-1].addr_spec
784
785
786class AngleAddr(TokenList):
787
788 token_type = 'angle-addr'
789
790 @property
791 def local_part(self):
792 for x in self:
793 if x.token_type == 'addr-spec':
794 return x.local_part
795
796 @property
797 def domain(self):
798 for x in self:
799 if x.token_type == 'addr-spec':
800 return x.domain
801
802 @property
803 def route(self):
804 for x in self:
805 if x.token_type == 'obs-route':
806 return x.domains
807
808 @property
809 def addr_spec(self):
810 for x in self:
811 if x.token_type == 'addr-spec':
812 return x.addr_spec
R David Murray032eed32012-05-26 14:31:12 -0400813 else:
814 return '<>'
R David Murray0b6f6c82012-05-25 18:42:14 -0400815
816
817class ObsRoute(TokenList):
818
819 token_type = 'obs-route'
820
821 @property
822 def domains(self):
823 return [x.domain for x in self if x.token_type == 'domain']
824
825
826class Mailbox(TokenList):
827
828 token_type = 'mailbox'
829
830 @property
831 def display_name(self):
832 if self[0].token_type == 'name-addr':
833 return self[0].display_name
834
835 @property
836 def local_part(self):
837 return self[0].local_part
838
839 @property
840 def domain(self):
841 return self[0].domain
842
843 @property
844 def route(self):
845 if self[0].token_type == 'name-addr':
846 return self[0].route
847
848 @property
849 def addr_spec(self):
850 return self[0].addr_spec
851
852
853class InvalidMailbox(TokenList):
854
855 token_type = 'invalid-mailbox'
856
857 @property
858 def display_name(self):
859 return None
860
861 local_part = domain = route = addr_spec = display_name
862
863
864class Domain(TokenList):
865
866 token_type = 'domain'
867
868 @property
869 def domain(self):
870 return ''.join(super().value.split())
871
872
873class DotAtom(TokenList):
874
875 token_type = 'dot-atom'
876
877
878class DotAtomText(TokenList):
879
880 token_type = 'dot-atom-text'
881
882
883class AddrSpec(TokenList):
884
885 token_type = 'addr-spec'
886
887 @property
888 def local_part(self):
889 return self[0].local_part
890
891 @property
892 def domain(self):
893 if len(self) < 3:
894 return None
895 return self[-1].domain
896
897 @property
898 def value(self):
899 if len(self) < 3:
900 return self[0].value
901 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
902
903 @property
904 def addr_spec(self):
905 nameset = set(self.local_part)
906 if len(nameset) > len(nameset-DOT_ATOM_ENDS):
907 lp = quote_string(self.local_part)
908 else:
909 lp = self.local_part
910 if self.domain is not None:
911 return lp + '@' + self.domain
912 return lp
913
914
915class ObsLocalPart(TokenList):
916
917 token_type = 'obs-local-part'
918
919
920class DisplayName(Phrase):
921
922 token_type = 'display-name'
923
924 @property
925 def display_name(self):
926 res = TokenList(self)
927 if res[0].token_type == 'cfws':
928 res.pop(0)
929 else:
930 if res[0][0].token_type == 'cfws':
931 res[0] = TokenList(res[0][1:])
932 if res[-1].token_type == 'cfws':
933 res.pop()
934 else:
935 if res[-1][-1].token_type == 'cfws':
936 res[-1] = TokenList(res[-1][:-1])
937 return res.value
938
939 @property
940 def value(self):
941 quote = False
942 if self.defects:
943 quote = True
944 else:
945 for x in self:
946 if x.token_type == 'quoted-string':
947 quote = True
948 if quote:
949 pre = post = ''
950 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
951 pre = ' '
952 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
953 post = ' '
954 return pre+quote_string(self.display_name)+post
955 else:
956 return super().value
957
958
959class LocalPart(TokenList):
960
961 token_type = 'local-part'
962
963 @property
964 def value(self):
965 if self[0].token_type == "quoted-string":
966 return self[0].quoted_value
967 else:
968 return self[0].value
969
970 @property
971 def local_part(self):
972 # Strip whitespace from front, back, and around dots.
973 res = [DOT]
974 last = DOT
975 last_is_tl = False
976 for tok in self[0] + [DOT]:
977 if tok.token_type == 'cfws':
978 continue
979 if (last_is_tl and tok.token_type == 'dot' and
980 last[-1].token_type == 'cfws'):
981 res[-1] = TokenList(last[:-1])
982 is_tl = isinstance(tok, TokenList)
983 if (is_tl and last.token_type == 'dot' and
984 tok[0].token_type == 'cfws'):
985 res.append(TokenList(tok[1:]))
986 else:
987 res.append(tok)
988 last = res[-1]
989 last_is_tl = is_tl
990 res = TokenList(res[1:-1])
991 return res.value
992
993
994class DomainLiteral(TokenList):
995
996 token_type = 'domain-literal'
997
998 @property
999 def domain(self):
1000 return ''.join(super().value.split())
1001
1002 @property
1003 def ip(self):
1004 for x in self:
1005 if x.token_type == 'ptext':
1006 return x.value
1007
1008
R David Murray97f43c02012-06-24 05:03:27 -04001009class MIMEVersion(TokenList):
1010
1011 token_type = 'mime-version'
1012 major = None
1013 minor = None
1014
1015
1016class Parameter(TokenList):
1017
1018 token_type = 'parameter'
1019 sectioned = False
1020 extended = False
1021 charset = 'us-ascii'
1022
1023 @property
1024 def section_number(self):
1025 # Because the first token, the attribute (name) eats CFWS, the second
1026 # token is always the section if there is one.
1027 return self[1].number if self.sectioned else 0
1028
1029 @property
1030 def param_value(self):
1031 # This is part of the "handle quoted extended parameters" hack.
1032 for token in self:
1033 if token.token_type == 'value':
1034 return token.stripped_value
1035 if token.token_type == 'quoted-string':
1036 for token in token:
1037 if token.token_type == 'bare-quoted-string':
1038 for token in token:
1039 if token.token_type == 'value':
1040 return token.stripped_value
1041 return ''
1042
1043
1044class InvalidParameter(Parameter):
1045
1046 token_type = 'invalid-parameter'
1047
1048
1049class Attribute(TokenList):
1050
1051 token_type = 'attribute'
1052
1053 @property
1054 def stripped_value(self):
1055 for token in self:
1056 if token.token_type.endswith('attrtext'):
1057 return token.value
1058
1059class Section(TokenList):
1060
1061 token_type = 'section'
1062 number = None
1063
1064
1065class Value(TokenList):
1066
1067 token_type = 'value'
1068
1069 @property
1070 def stripped_value(self):
1071 token = self[0]
1072 if token.token_type == 'cfws':
1073 token = self[1]
1074 if token.token_type.endswith(
1075 ('quoted-string', 'attribute', 'extended-attribute')):
1076 return token.stripped_value
1077 return self.value
1078
1079
1080class MimeParameters(TokenList):
1081
1082 token_type = 'mime-parameters'
1083
1084 @property
1085 def params(self):
1086 # The RFC specifically states that the ordering of parameters is not
1087 # guaranteed and may be reordered by the transport layer. So we have
1088 # to assume the RFC 2231 pieces can come in any order. However, we
1089 # output them in the order that we first see a given name, which gives
1090 # us a stable __str__.
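        #
        # For reference, the RFC 2231 sectioned/extended form reassembled
        # here looks roughly like this (illustrative header fragment):
        #
        #     Content-Disposition: attachment;
        #      filename*0*=utf-8''caf%C3%A9%20;
        #      filename*1*=menu.txt
        #
        # which yields a single 'filename' parameter whose decoded value is
        # 'café menu.txt'.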
1091 params = OrderedDict()
1092 for token in self:
1093 if not token.token_type.endswith('parameter'):
1094 continue
1095 if token[0].token_type != 'attribute':
1096 continue
1097 name = token[0].value.strip()
1098 if name not in params:
1099 params[name] = []
1100 params[name].append((token.section_number, token))
1101 for name, parts in params.items():
R David Murray7d0325d2015-03-29 21:53:05 -04001102 parts = sorted(parts, key=itemgetter(0))
1103 first_param = parts[0][1]
1104 charset = first_param.charset
1105 # Our arbitrary error recovery is to ignore duplicate parameters,
1106 # to use appearance order if there are duplicate rfc 2231 parts,
1107 # and to ignore gaps. This mimics the error recovery of get_param.
1108 if not first_param.extended and len(parts) > 1:
1109 if parts[1][0] == 0:
1110 parts[1][1].defects.append(errors.InvalidHeaderDefect(
1111 'duplicate parameter name; duplicate(s) ignored'))
1112 parts = parts[:1]
1113 # Else assume the *0* was missing...note that this is different
1114 # from get_param, but we registered a defect for this earlier.
R David Murray97f43c02012-06-24 05:03:27 -04001115 value_parts = []
R David Murray7d0325d2015-03-29 21:53:05 -04001116 i = 0
1117 for section_number, param in parts:
R David Murray97f43c02012-06-24 05:03:27 -04001118 if section_number != i:
R David Murray7d0325d2015-03-29 21:53:05 -04001119 # We could get fancier here and look for a complete
1120 # duplicate extended parameter and ignore the second one
1121 # seen. But we're not doing that. The old code didn't.
1122 if not param.extended:
1123 param.defects.append(errors.InvalidHeaderDefect(
1124 'duplicate parameter name; duplicate ignored'))
1125 continue
1126 else:
1127 param.defects.append(errors.InvalidHeaderDefect(
1128 "inconsistent RFC2231 parameter numbering"))
1129 i += 1
R David Murray97f43c02012-06-24 05:03:27 -04001130 value = param.param_value
1131 if param.extended:
1132 try:
1133 value = urllib.parse.unquote_to_bytes(value)
1134 except UnicodeEncodeError:
1135 # source had surrogate escaped bytes. What we do now
1136 # is a bit of an open question. I'm not sure this is
1137 # the best choice, but it is what the old algorithm did
1138 value = urllib.parse.unquote(value, encoding='latin-1')
1139 else:
1140 try:
1141 value = value.decode(charset, 'surrogateescape')
1142 except LookupError:
1143 # XXX: there should really be a custom defect for
1144 # unknown character set to make it easy to find,
1145 # because otherwise unknown charset is a silent
1146 # failure.
1147 value = value.decode('us-ascii', 'surrogateescape')
1148 if utils._has_surrogates(value):
1149 param.defects.append(errors.UndecodableBytesDefect())
1150 value_parts.append(value)
1151 value = ''.join(value_parts)
1152 yield name, value
1153
1154 def __str__(self):
1155 params = []
1156 for name, value in self.params:
1157 if value:
1158 params.append('{}={}'.format(name, quote_string(value)))
1159 else:
1160 params.append(name)
1161 params = '; '.join(params)
1162 return ' ' + params if params else ''
1163
1164
1165class ParameterizedHeaderValue(TokenList):
1166
1167 @property
1168 def params(self):
1169 for token in reversed(self):
1170 if token.token_type == 'mime-parameters':
1171 return token.params
1172 return {}
1173
1174 @property
1175 def parts(self):
1176 if self and self[-1].token_type == 'mime-parameters':
1177 # We don't want to start a new line if all of the params don't fit
1178 # after the value, so unwrap the parameter list.
1179 return TokenList(self[:-1] + self[-1])
1180 return TokenList(self).parts
1181
1182
1183class ContentType(ParameterizedHeaderValue):
1184
1185 token_type = 'content-type'
1186 maintype = 'text'
1187 subtype = 'plain'
1188
1189
1190class ContentDisposition(ParameterizedHeaderValue):
1191
1192 token_type = 'content-disposition'
1193 content_disposition = None
1194
1195
1196class ContentTransferEncoding(TokenList):
1197
1198 token_type = 'content-transfer-encoding'
1199 cte = '7bit'
1200
1201
R David Murray0b6f6c82012-05-25 18:42:14 -04001202class HeaderLabel(TokenList):
1203
1204 token_type = 'header-label'
1205
1206
1207class Header(TokenList):
1208
1209 token_type = 'header'
1210
1211 def _fold(self, folded):
1212 folded.append(str(self.pop(0)))
1213 folded.lastlen = len(folded.current[0])
1214 # The first line of the header is different from all others: we don't
1215 # want to start a new object on a new line if it has any fold points in
1216 # it that would allow part of it to be on the first header line.
1217 # Further, if the first fold point would fit on the new line, we want
1218 # to do that, but if it doesn't we want to put it on the first line.
1219 # Folded supports this via the stickyspace attribute. If this
1220 # attribute is not None, it does the special handling.
1221 folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
1222 rest = self.pop(0)
1223 if self:
1224 raise ValueError("Malformed Header token list")
1225 rest._fold(folded)
1226
1227
1228#
1229# Terminal classes and instances
1230#
1231
1232class Terminal(str):
1233
1234 def __new__(cls, value, token_type):
1235 self = super().__new__(cls, value)
1236 self.token_type = token_type
1237 self.defects = []
1238 return self
1239
1240 def __repr__(self):
1241 return "{}({})".format(self.__class__.__name__, super().__repr__())
1242
1243 @property
1244 def all_defects(self):
1245 return list(self.defects)
1246
1247 def _pp(self, indent=''):
1248 return ["{}{}/{}({}){}".format(
1249 indent,
1250 self.__class__.__name__,
1251 self.token_type,
1252 super().__repr__(),
1253 '' if not self.defects else ' {}'.format(self.defects),
1254 )]
1255
1256 def cte_encode(self, charset, policy):
1257 value = str(self)
1258 try:
1259 value.encode('us-ascii')
1260 return value
1261 except UnicodeEncodeError:
1262 return _ew.encode(value, charset)
1263
1264 def pop_trailing_ws(self):
1265 # This terminates the recursion.
1266 return None
1267
1268 def pop_leading_fws(self):
1269 # This terminates the recursion.
1270 return None
1271
1272 @property
1273 def comments(self):
1274 return []
1275
1276 def has_leading_comment(self):
1277 return False
1278
1279 def __getnewargs__(self):
1280 return(str(self), self.token_type)
1281
1282
1283class WhiteSpaceTerminal(Terminal):
1284
1285 @property
1286 def value(self):
1287 return ' '
1288
1289 def startswith_fws(self):
1290 return True
1291
1292 has_fws = True
1293
1294
1295class ValueTerminal(Terminal):
1296
1297 @property
1298 def value(self):
1299 return self
1300
1301 def startswith_fws(self):
1302 return False
1303
1304 has_fws = False
1305
1306 def as_encoded_word(self, charset):
1307 return _ew.encode(str(self), charset)
1308
1309
1310class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
1311
1312 @property
1313 def value(self):
1314 return ''
1315
1316 @property
1317 def encoded(self):
1318 return self[:]
1319
1320 def __str__(self):
1321 return ''
1322
1323 has_fws = True
1324
1325
1326# XXX these need to become classes and used as instances so
1327# that a program can't change them in a parse tree and screw
1328# up other parse trees. Maybe should have tests for that, too.
1329DOT = ValueTerminal('.', 'dot')
1330ListSeparator = ValueTerminal(',', 'list-separator')
1331RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
1332
1333#
1334# Parser
1335#
1336
# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser. Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input. Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.
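#
# A typical caller therefore drives the parse with a loop of the general
# shape below (illustrative only; 'value' holds the header value string and
# the particular get_XXX calls depend on the grammar of the header):
#
#     tokens = []
#     while value:
#         token, value = get_phrase(value)
#         tokens.append(token)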
R David Murray0b6f6c82012-05-25 18:42:14 -04001353
1354_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
1355_non_atom_end_matcher = re.compile(r"[^{}]+".format(
1356 ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']','\]'))).match
1357_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
R David Murray97f43c02012-06-24 05:03:27 -04001358_non_token_end_matcher = re.compile(r"[^{}]+".format(
1359 ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']','\]'))).match
1360_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
1361 ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']','\]'))).match
1362_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
1363 ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
1364 '\\','\\\\').replace(']','\]'))).match
R David Murray0b6f6c82012-05-25 18:42:14 -04001365
1366def _validate_xtext(xtext):
1367 """If input token contains ASCII non-printables, register a defect."""
1368
1369 non_printables = _non_printable_finder(xtext)
1370 if non_printables:
1371 xtext.defects.append(errors.NonPrintableDefect(non_printables))
1372 if utils._has_surrogates(xtext):
1373 xtext.defects.append(errors.UndecodableBytesDefect(
1374 "Non-ASCII characters found in header token"))
1375
1376def _get_ptext_to_endchars(value, endchars):
1377 """Scan printables/quoted-pairs until endchars and return unquoted ptext.
1378
1379 This function turns a run of qcontent, ccontent-without-comments, or
1380 dtext-with-quoted-printables into a single string by unquoting any
1381 quoted printables. It returns the string, the remaining value, and
1382 a flag that is True iff there were any quoted printables decoded.
1383
1384 """
1385 fragment, *remainder = _wsp_splitter(value, 1)
1386 vchars = []
1387 escape = False
1388 had_qp = False
1389 for pos in range(len(fragment)):
1390 if fragment[pos] == '\\':
1391 if escape:
1392 escape = False
1393 had_qp = True
1394 else:
1395 escape = True
1396 continue
1397 if escape:
1398 escape = False
1399 elif fragment[pos] in endchars:
1400 break
1401 vchars.append(fragment[pos])
1402 else:
1403 pos = pos + 1
1404 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1405
R David Murray0b6f6c82012-05-25 18:42:14 -04001406def get_fws(value):
1407 """FWS = 1*WSP
1408
1409 This isn't the RFC definition. We're using fws to represent tokens where
1410 folding can be done, but when we are parsing the *un*folding has already
1411 been done so we don't need to watch out for CRLF.
1412
1413 """
1414 newvalue = value.lstrip()
1415 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
1416 return fws, newvalue
1417
1418def get_encoded_word(value):
1419 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
1420
1421 """
1422 ew = EncodedWord()
1423 if not value.startswith('=?'):
1424 raise errors.HeaderParseError(
1425 "expected encoded word but found {}".format(value))
1426 tok, *remainder = value[2:].split('?=', 1)
1427 if tok == value[2:]:
1428 raise errors.HeaderParseError(
1429 "expected encoded word but found {}".format(value))
1430 remstr = ''.join(remainder)
R David Murray65171b22013-07-11 15:52:57 -04001431 if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
1432 # The ? after the CTE was followed by an encoded word escape (=XX).
R David Murray0b6f6c82012-05-25 18:42:14 -04001433 rest, *remainder = remstr.split('?=', 1)
1434 tok = tok + '?=' + rest
1435 if len(tok.split()) > 1:
1436 ew.defects.append(errors.InvalidHeaderDefect(
1437 "whitespace inside encoded word"))
1438 ew.cte = value
1439 value = ''.join(remainder)
1440 try:
1441 text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
1442 except ValueError:
1443 raise errors.HeaderParseError(
1444 "encoded word format invalid: '{}'".format(ew.cte))
1445 ew.charset = charset
1446 ew.lang = lang
1447 ew.defects.extend(defects)
1448 while text:
1449 if text[0] in WSP:
1450 token, text = get_fws(text)
1451 ew.append(token)
1452 continue
1453 chars, *remainder = _wsp_splitter(text, 1)
1454 vtext = ValueTerminal(chars, 'vtext')
1455 _validate_xtext(vtext)
1456 ew.append(vtext)
1457 text = ''.join(remainder)
1458 return ew, value
1459
1460def get_unstructured(value):
1461 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
1462 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
1463 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
1464
1465 obs-NO-WS-CTL is control characters except WSP/CR/LF.
1466
1467 So, basically, we have printable runs, plus control characters or nulls in
1468 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
1469 obsolete syntax in its specification, but requires whitespace on either
1470 side of the encoded words, I can see no reason to need to separate the
1471 non-printable-non-whitespace from the printable runs if they occur, so we
1472 parse this into xtext tokens separated by WSP tokens.
1473
1474 Because an 'unstructured' value must by definition constitute the entire
1475 value, this 'get' routine does not return a remaining value, only the
1476 parsed TokenList.
1477
1478 """
1479 # XXX: but what about bare CR and LF? They might signal the start or
R David Murray65171b22013-07-11 15:52:57 -04001480 # end of an encoded word. YAGNI for now, since our current parsers
1481 # will never send us strings with bare CR or LF.
R David Murray0b6f6c82012-05-25 18:42:14 -04001482
1483 unstructured = UnstructuredTokenList()
1484 while value:
1485 if value[0] in WSP:
1486 token, value = get_fws(value)
1487 unstructured.append(token)
1488 continue
1489 if value.startswith('=?'):
1490 try:
1491 token, value = get_encoded_word(value)
1492 except errors.HeaderParseError:
R David Murray65171b22013-07-11 15:52:57 -04001493 # XXX: Need to figure out how to register defects when
1494 # appropriate here.
R David Murray0b6f6c82012-05-25 18:42:14 -04001495 pass
1496 else:
1497 have_ws = True
1498 if len(unstructured) > 0:
1499 if unstructured[-1].token_type != 'fws':
1500 unstructured.defects.append(errors.InvalidHeaderDefect(
1501 "missing whitespace before encoded word"))
1502 have_ws = False
1503 if have_ws and len(unstructured) > 1:
1504 if unstructured[-2].token_type == 'encoded-word':
1505 unstructured[-1] = EWWhiteSpaceTerminal(
1506 unstructured[-1], 'fws')
1507 unstructured.append(token)
1508 continue
1509 tok, *remainder = _wsp_splitter(value, 1)
1510 vtext = ValueTerminal(tok, 'vtext')
1511 _validate_xtext(vtext)
1512 unstructured.append(vtext)
1513 value = ''.join(remainder)
1514 return unstructured
1515
1516def get_qp_ctext(value):
1517 """ctext = <printable ascii except \ ( )>
1518
1519 This is not the RFC ctext, since we are handling nested comments in comment
1520 and unquoting quoted-pairs here. We allow anything except the '()'
    characters, but if we find any ASCII other than the RFC defined printable
    ASCII, a NonPrintableDefect is added to the token's defects list. Since
    quoted pairs are converted to their unquoted values, what is returned is
    a 'ptext' token. In this case it is a WhiteSpaceTerminal, so its value
    is ' '.
1526
1527 """
1528 ptext, value, _ = _get_ptext_to_endchars(value, '()')
1529 ptext = WhiteSpaceTerminal(ptext, 'ptext')
1530 _validate_xtext(ptext)
1531 return ptext, value
1532
1533def get_qcontent(value):
1534 """qcontent = qtext / quoted-pair
1535
1536 We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
1538 added to the token's defects list. Any quoted pairs are converted to their
1539 unquoted values, so what is returned is a 'ptext' token. In this case it
1540 is a ValueTerminal.
1541
1542 """
1543 ptext, value, _ = _get_ptext_to_endchars(value, '"')
1544 ptext = ValueTerminal(ptext, 'ptext')
1545 _validate_xtext(ptext)
1546 return ptext, value
1547
1548def get_atext(value):
1549 """atext = <matches _atext_matcher>
1550
1551 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
1552 the token's defects list if we find non-atext characters.
1553 """
1554 m = _non_atom_end_matcher(value)
1555 if not m:
1556 raise errors.HeaderParseError(
1557 "expected atext but found '{}'".format(value))
1558 atext = m.group()
1559 value = value[len(atext):]
1560 atext = ValueTerminal(atext, 'atext')
1561 _validate_xtext(atext)
1562 return atext, value
1563
1564def get_bare_quoted_string(value):
1565 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
1566
1567 A quoted-string without the leading or trailing white space. Its
1568 value is the text between the quote marks, with whitespace
1569 preserved and quoted pairs decoded.
1570 """
1571 if value[0] != '"':
1572 raise errors.HeaderParseError(
1573 "expected '\"' but found '{}'".format(value))
1574 bare_quoted_string = BareQuotedString()
1575 value = value[1:]
1576 while value and value[0] != '"':
1577 if value[0] in WSP:
1578 token, value = get_fws(value)
R David Murray0400d332014-02-08 13:12:00 -05001579 elif value[:2] == '=?':
1580 try:
1581 token, value = get_encoded_word(value)
1582 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1583 "encoded word inside quoted string"))
1584 except errors.HeaderParseError:
1585 token, value = get_qcontent(value)
R David Murray0b6f6c82012-05-25 18:42:14 -04001586 else:
1587 token, value = get_qcontent(value)
1588 bare_quoted_string.append(token)
1589 if not value:
1590 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1591 "end of header inside quoted string"))
1592 return bare_quoted_string, value
1593 return bare_quoted_string, value[1:]
1594
1595def get_comment(value):
1596 """comment = "(" *([FWS] ccontent) [FWS] ")"
1597 ccontent = ctext / quoted-pair / comment
1598
1599 We handle nested comments here, and quoted-pair in our qp-ctext routine.
1600 """
1601 if value and value[0] != '(':
1602 raise errors.HeaderParseError(
1603 "expected '(' but found '{}'".format(value))
1604 comment = Comment()
1605 value = value[1:]
1606 while value and value[0] != ")":
1607 if value[0] in WSP:
1608 token, value = get_fws(value)
1609 elif value[0] == '(':
1610 token, value = get_comment(value)
1611 else:
1612 token, value = get_qp_ctext(value)
1613 comment.append(token)
1614 if not value:
1615 comment.defects.append(errors.InvalidHeaderDefect(
1616 "end of header inside comment"))
1617 return comment, value
1618 return comment, value[1:]
1619
1620def get_cfws(value):
1621 """CFWS = (1*([FWS] comment) [FWS]) / FWS
1622
1623 """
1624 cfws = CFWSList()
1625 while value and value[0] in CFWS_LEADER:
1626 if value[0] in WSP:
1627 token, value = get_fws(value)
1628 else:
1629 token, value = get_comment(value)
1630 cfws.append(token)
1631 return cfws, value
1632
1633def get_quoted_string(value):
1634 """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
1635
1636 'bare-quoted-string' is an intermediate class defined by this
1637 parser and not by the RFC grammar. It is the quoted string
1638 without any attached CFWS.
1639 """
1640 quoted_string = QuotedString()
1641 if value and value[0] in CFWS_LEADER:
1642 token, value = get_cfws(value)
1643 quoted_string.append(token)
1644 token, value = get_bare_quoted_string(value)
1645 quoted_string.append(token)
1646 if value and value[0] in CFWS_LEADER:
1647 token, value = get_cfws(value)
1648 quoted_string.append(token)
1649 return quoted_string, value
1650
1651def get_atom(value):
1652 """atom = [CFWS] 1*atext [CFWS]
1653
R David Murray923512f2013-07-12 16:00:28 -04001654 An atom could be an rfc2047 encoded word.
R David Murray0b6f6c82012-05-25 18:42:14 -04001655 """
1656 atom = Atom()
1657 if value and value[0] in CFWS_LEADER:
1658 token, value = get_cfws(value)
1659 atom.append(token)
1660 if value and value[0] in ATOM_ENDS:
1661 raise errors.HeaderParseError(
1662 "expected atom but found '{}'".format(value))
R David Murray923512f2013-07-12 16:00:28 -04001663 if value.startswith('=?'):
1664 try:
1665 token, value = get_encoded_word(value)
1666 except errors.HeaderParseError:
1667 # XXX: need to figure out how to register defects when
1668 # appropriate here.
1669 token, value = get_atext(value)
1670 else:
1671 token, value = get_atext(value)
R David Murray0b6f6c82012-05-25 18:42:14 -04001672 atom.append(token)
1673 if value and value[0] in CFWS_LEADER:
1674 token, value = get_cfws(value)
1675 atom.append(token)
1676 return atom, value
1677
1678def get_dot_atom_text(value):
1679 """ dot-text = 1*atext *("." 1*atext)
1680
1681 """
1682 dot_atom_text = DotAtomText()
1683 if not value or value[0] in ATOM_ENDS:
1684 raise errors.HeaderParseError("expected atom at a start of "
1685 "dot-atom-text but found '{}'".format(value))
1686 while value and value[0] not in ATOM_ENDS:
1687 token, value = get_atext(value)
1688 dot_atom_text.append(token)
1689 if value and value[0] == '.':
1690 dot_atom_text.append(DOT)
1691 value = value[1:]
1692 if dot_atom_text[-1] is DOT:
1693 raise errors.HeaderParseError("expected atom at end of dot-atom-text "
1694 "but found '{}'".format('.'+value))
1695 return dot_atom_text, value
1696
1697def get_dot_atom(value):
1698 """ dot-atom = [CFWS] dot-atom-text [CFWS]
1699
R David Murray923512f2013-07-12 16:00:28 -04001700 Any place we can have a dot atom, we could instead have an rfc2047 encoded
1701 word.
R David Murray0b6f6c82012-05-25 18:42:14 -04001702 """
1703 dot_atom = DotAtom()
1704 if value[0] in CFWS_LEADER:
1705 token, value = get_cfws(value)
1706 dot_atom.append(token)
R David Murray923512f2013-07-12 16:00:28 -04001707 if value.startswith('=?'):
1708 try:
1709 token, value = get_encoded_word(value)
1710 except errors.HeaderParseError:
1711 # XXX: need to figure out how to register defects when
1712 # appropriate here.
1713 token, value = get_dot_atom_text(value)
1714 else:
1715 token, value = get_dot_atom_text(value)
R David Murray0b6f6c82012-05-25 18:42:14 -04001716 dot_atom.append(token)
1717 if value and value[0] in CFWS_LEADER:
1718 token, value = get_cfws(value)
1719 dot_atom.append(token)
1720 return dot_atom, value
1721
1722def get_word(value):
1723 """word = atom / quoted-string
1724
1725 Either atom or quoted-string may start with CFWS. We have to peel off this
1726 CFWS first to determine which type of word to parse. Afterward we splice
1727 the leading CFWS, if any, into the parsed sub-token.
1728
    If neither an atom nor a quoted-string is found before the next special, a
1730 HeaderParseError is raised.
1731
1732 The token returned is either an Atom or a QuotedString, as appropriate.
1733 This means the 'word' level of the formal grammar is not represented in the
1734 parse tree; this is because having that extra layer when manipulating the
1735 parse tree is more confusing than it is helpful.
1736
1737 """
1738 if value[0] in CFWS_LEADER:
1739 leader, value = get_cfws(value)
1740 else:
1741 leader = None
1742 if value[0]=='"':
1743 token, value = get_quoted_string(value)
1744 elif value[0] in SPECIALS:
1745 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
1746 "but found '{}'".format(value))
1747 else:
1748 token, value = get_atom(value)
1749 if leader is not None:
1750 token[:0] = [leader]
1751 return token, value
1752
1753def get_phrase(value):
1754 """ phrase = 1*word / obs-phrase
1755 obs-phrase = word *(word / "." / CFWS)
1756
1757 This means a phrase can be a sequence of words, periods, and CFWS in any
1758 order as long as it starts with at least one word. If anything other than
1759 words is detected, an ObsoleteHeaderDefect is added to the token's defect
1760 list. We also accept a phrase that starts with CFWS followed by a dot;
1761 this is registered as an InvalidHeaderDefect, since it is not supported by
1762 even the obsolete grammar.
1763
1764 """
1765 phrase = Phrase()
1766 try:
1767 token, value = get_word(value)
1768 phrase.append(token)
1769 except errors.HeaderParseError:
1770 phrase.defects.append(errors.InvalidHeaderDefect(
1771 "phrase does not start with word"))
1772 while value and value[0] not in PHRASE_ENDS:
1773 if value[0]=='.':
1774 phrase.append(DOT)
1775 phrase.defects.append(errors.ObsoleteHeaderDefect(
1776 "period in 'phrase'"))
1777 value = value[1:]
1778 else:
1779 try:
1780 token, value = get_word(value)
1781 except errors.HeaderParseError:
1782 if value[0] in CFWS_LEADER:
1783 token, value = get_cfws(value)
1784 phrase.defects.append(errors.ObsoleteHeaderDefect(
1785 "comment found without atom"))
1786 else:
1787 raise
1788 phrase.append(token)
1789 return phrase, value
1790
1791def get_local_part(value):
1792 """ local-part = dot-atom / quoted-string / obs-local-part
1793
1794 """
1795 local_part = LocalPart()
1796 leader = None
1797 if value[0] in CFWS_LEADER:
1798 leader, value = get_cfws(value)
1799 if not value:
1800 raise errors.HeaderParseError(
1801 "expected local-part but found '{}'".format(value))
1802 try:
1803 token, value = get_dot_atom(value)
1804 except errors.HeaderParseError:
1805 try:
1806 token, value = get_word(value)
1807 except errors.HeaderParseError:
1808 if value[0] != '\\' and value[0] in PHRASE_ENDS:
1809 raise
1810 token = TokenList()
1811 if leader is not None:
1812 token[:0] = [leader]
1813 local_part.append(token)
1814 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1815 obs_local_part, value = get_obs_local_part(str(local_part) + value)
1816 if obs_local_part.token_type == 'invalid-obs-local-part':
1817 local_part.defects.append(errors.InvalidHeaderDefect(
1818 "local-part is not dot-atom, quoted-string, or obs-local-part"))
1819 else:
1820 local_part.defects.append(errors.ObsoleteHeaderDefect(
1821 "local-part is not a dot-atom (contains CFWS)"))
1822 local_part[0] = obs_local_part
1823 try:
1824 local_part.value.encode('ascii')
1825 except UnicodeEncodeError:
1826 local_part.defects.append(errors.NonASCIILocalPartDefect(
1827 "local-part contains non-ASCII characters)"))
1828 return local_part, value
1829
1830def get_obs_local_part(value):
1831 """ obs-local-part = word *("." word)
1832 """
1833 obs_local_part = ObsLocalPart()
1834 last_non_ws_was_dot = False
1835 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1836 if value[0] == '.':
1837 if last_non_ws_was_dot:
1838 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1839 "invalid repeated '.'"))
1840 obs_local_part.append(DOT)
1841 last_non_ws_was_dot = True
1842 value = value[1:]
1843 continue
1844 elif value[0]=='\\':
1845 obs_local_part.append(ValueTerminal(value[0],
1846 'misplaced-special'))
1847 value = value[1:]
1848 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1849 "'\\' character outside of quoted-string/ccontent"))
1850 last_non_ws_was_dot = False
1851 continue
1852 if obs_local_part and obs_local_part[-1].token_type != 'dot':
1853 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1854 "missing '.' between words"))
1855 try:
1856 token, value = get_word(value)
1857 last_non_ws_was_dot = False
1858 except errors.HeaderParseError:
1859 if value[0] not in CFWS_LEADER:
1860 raise
1861 token, value = get_cfws(value)
1862 obs_local_part.append(token)
1863 if (obs_local_part[0].token_type == 'dot' or
1864 obs_local_part[0].token_type=='cfws' and
1865 obs_local_part[1].token_type=='dot'):
1866 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1867 "Invalid leading '.' in local part"))
1868 if (obs_local_part[-1].token_type == 'dot' or
1869 obs_local_part[-1].token_type=='cfws' and
1870 obs_local_part[-2].token_type=='dot'):
1871 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1872 "Invalid trailing '.' in local part"))
1873 if obs_local_part.defects:
1874 obs_local_part.token_type = 'invalid-obs-local-part'
1875 return obs_local_part, value
1876
1877def get_dtext(value):
1878 """ dtext = <printable ascii except \ [ ]> / obs-dtext
1879 obs-dtext = obs-NO-WS-CTL / quoted-pair
1880
1881 We allow anything except the excluded characters, but if we find any
1882 ASCII other than the RFC-defined printable ASCII, a NonPrintableDefect is
1883 added to the token's defects list. Quoted pairs are converted to their
1884 unquoted values, so what is returned is a ptext token, in this case a
1885 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
1886 added to the returned token's defect list.
1887
1888 """
1889 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
1890 ptext = ValueTerminal(ptext, 'ptext')
1891 if had_qp:
1892 ptext.defects.append(errors.ObsoleteHeaderDefect(
1893 "quoted printable found in domain-literal"))
1894 _validate_xtext(ptext)
1895 return ptext, value
1896
1897def _check_for_early_dl_end(value, domain_literal):
1898 if value:
1899 return False
1900 domain_literal.append(errors.InvalidHeaderDefect(
1901 "end of input inside domain-literal"))
1902 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1903 return True
1904
1905def get_domain_literal(value):
1906 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
1907
1908 """
1909 domain_literal = DomainLiteral()
1910 if value[0] in CFWS_LEADER:
1911 token, value = get_cfws(value)
1912 domain_literal.append(token)
1913 if not value:
1914 raise errors.HeaderParseError("expected domain-literal")
1915 if value[0] != '[':
1916 raise errors.HeaderParseError("expected '[' at start of domain-literal "
1917 "but found '{}'".format(value))
1918 value = value[1:]
1919 if _check_for_early_dl_end(value, domain_literal):
1920 return domain_literal, value
1921 domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
1922 if value[0] in WSP:
1923 token, value = get_fws(value)
1924 domain_literal.append(token)
1925 token, value = get_dtext(value)
1926 domain_literal.append(token)
1927 if _check_for_early_dl_end(value, domain_literal):
1928 return domain_literal, value
1929 if value[0] in WSP:
1930 token, value = get_fws(value)
1931 domain_literal.append(token)
1932 if _check_for_early_dl_end(value, domain_literal):
1933 return domain_literal, value
1934 if value[0] != ']':
1935 raise errors.HeaderParseError("expected ']' at end of domain-literal "
1936 "but found '{}'".format(value))
1937 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1938 value = value[1:]
1939 if value and value[0] in CFWS_LEADER:
1940 token, value = get_cfws(value)
1941 domain_literal.append(token)
1942 return domain_literal, value
1943
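# Editorial sketch (illustration only): parsing a simple domain-literal.
# The address-literal text is an assumed example.
def _example_get_domain_literal():
    token, rest = get_domain_literal('[127.0.0.1]')
    # token is a DomainLiteral whose dtext run is '127.0.0.1'; rest is empty.
    return token, rest
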
1944def get_domain(value):
1945 """ domain = dot-atom / domain-literal / obs-domain
1946 obs-domain = atom *("." atom)
1947
1948 """
1949 domain = Domain()
1950 leader = None
1951 if value[0] in CFWS_LEADER:
1952 leader, value = get_cfws(value)
1953 if not value:
1954 raise errors.HeaderParseError(
1955 "expected domain but found '{}'".format(value))
1956 if value[0] == '[':
1957 token, value = get_domain_literal(value)
1958 if leader is not None:
1959 token[:0] = [leader]
1960 domain.append(token)
1961 return domain, value
1962 try:
1963 token, value = get_dot_atom(value)
1964 except errors.HeaderParseError:
1965 token, value = get_atom(value)
1966 if leader is not None:
1967 token[:0] = [leader]
1968 domain.append(token)
1969 if value and value[0] == '.':
1970 domain.defects.append(errors.ObsoleteHeaderDefect(
1971 "domain is not a dot-atom (contains CFWS)"))
1972 if domain[0].token_type == 'dot-atom':
1973 domain[:] = domain[0]
1974 while value and value[0] == '.':
1975 domain.append(DOT)
1976 token, value = get_atom(value[1:])
1977 domain.append(token)
1978 return domain, value
1979
1980def get_addr_spec(value):
1981 """ addr-spec = local-part "@" domain
1982
1983 """
1984 addr_spec = AddrSpec()
1985 token, value = get_local_part(value)
1986 addr_spec.append(token)
1987 if not value or value[0] != '@':
1988 addr_spec.defects.append(errors.InvalidHeaderDefect(
1989 "add-spec local part with no domain"))
1990 return addr_spec, value
1991 addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
1992 token, value = get_domain(value[1:])
1993 addr_spec.append(token)
1994 return addr_spec, value
1995
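# Editorial sketch (illustration only): a well-formed addr-spec parses into
# local-part, '@', and domain sub-tokens with no defects.  The address used
# here is an assumed example.
def _example_get_addr_spec():
    addr_spec, rest = get_addr_spec('dinsdale@example.com')
    return addr_spec, rest
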
1996def get_obs_route(value):
1997 """ obs-route = obs-domain-list ":"
1998 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
1999
2000 Returns an obs-route token with the appropriate sub-tokens (that is,
2001 there is no obs-domain-list in the parse tree).
2002 """
2003 obs_route = ObsRoute()
2004 while value and (value[0]==',' or value[0] in CFWS_LEADER):
2005 if value[0] in CFWS_LEADER:
2006 token, value = get_cfws(value)
2007 obs_route.append(token)
2008 elif value[0] == ',':
2009 obs_route.append(ListSeparator)
2010 value = value[1:]
2011 if not value or value[0] != '@':
2012 raise errors.HeaderParseError(
2013 "expected obs-route domain but found '{}'".format(value))
2014 obs_route.append(RouteComponentMarker)
2015 token, value = get_domain(value[1:])
2016 obs_route.append(token)
2017 while value and value[0]==',':
2018 obs_route.append(ListSeparator)
2019 value = value[1:]
2020 if not value:
2021 break
2022 if value[0] in CFWS_LEADER:
2023 token, value = get_cfws(value)
2024 obs_route.append(token)
2025 if value[0] == '@':
2026 obs_route.append(RouteComponentMarker)
2027 token, value = get_domain(value[1:])
2028 obs_route.append(token)
2029 if not value:
2030 raise errors.HeaderParseError("end of header while parsing obs-route")
2031 if value[0] != ':':
2032 raise errors.HeaderParseError("expected ':' marking end of "
2033 "obs-route but found '{}'".format(value))
2034 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
2035 return obs_route, value[1:]
2036
2037def get_angle_addr(value):
2038 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
2039 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
2040
2041 """
2042 angle_addr = AngleAddr()
2043 if value[0] in CFWS_LEADER:
2044 token, value = get_cfws(value)
2045 angle_addr.append(token)
2046 if not value or value[0] != '<':
2047 raise errors.HeaderParseError(
2048 "expected angle-addr but found '{}'".format(value))
2049 angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
2050 value = value[1:]
2051 # Although it is not legal per RFC5322, SMTP uses '<>' in certain
2052 # circumstances.
2053 if value[0] == '>':
2054 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2055 angle_addr.defects.append(errors.InvalidHeaderDefect(
2056 "null addr-spec in angle-addr"))
2057 value = value[1:]
2058 return angle_addr, value
2059 try:
2060 token, value = get_addr_spec(value)
2061 except errors.HeaderParseError:
2062 try:
2063 token, value = get_obs_route(value)
2064 angle_addr.defects.append(errors.ObsoleteHeaderDefect(
2065 "obsolete route specification in angle-addr"))
2066 except errors.HeaderParseError:
2067 raise errors.HeaderParseError(
2068 "expected addr-spec or obs-route but found '{}'".format(value))
2069 angle_addr.append(token)
2070 token, value = get_addr_spec(value)
2071 angle_addr.append(token)
2072 if value and value[0] == '>':
2073 value = value[1:]
2074 else:
2075 angle_addr.defects.append(errors.InvalidHeaderDefect(
2076 "missing trailing '>' on angle-addr"))
2077 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2078 if value and value[0] in CFWS_LEADER:
2079 token, value = get_cfws(value)
2080 angle_addr.append(token)
2081 return angle_addr, value
2082
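# Editorial sketch (illustration only): the SMTP-style null address '<>' is
# tolerated but flagged, per the comment inside get_angle_addr above.
def _example_get_angle_addr_null():
    token, rest = get_angle_addr('<>')
    # token.defects should contain the "null addr-spec in angle-addr"
    # InvalidHeaderDefect; rest is empty.
    return token, rest
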
2083def get_display_name(value):
2084 """ display-name = phrase
2085
2086 Because this is simply a name-rule, we don't return a display-name
2087 token containing a phrase, but rather a display-name token with
2088 the content of the phrase.
2089
2090 """
2091 display_name = DisplayName()
2092 token, value = get_phrase(value)
2093 display_name.extend(token[:])
2094 display_name.defects = token.defects[:]
2095 return display_name, value
2096
2097
2098def get_name_addr(value):
2099 """ name-addr = [display-name] angle-addr
2100
2101 """
2102 name_addr = NameAddr()
2103 # Both the optional display name and the angle-addr can start with cfws.
2104 leader = None
2105 if value[0] in CFWS_LEADER:
2106 leader, value = get_cfws(value)
2107 if not value:
2108 raise errors.HeaderParseError(
2109 "expected name-addr but found '{}'".format(leader))
2110 if value[0] != '<':
2111 if value[0] in PHRASE_ENDS:
2112 raise errors.HeaderParseError(
2113 "expected name-addr but found '{}'".format(value))
2114 token, value = get_display_name(value)
2115 if not value:
2116 raise errors.HeaderParseError(
2117 "expected name-addr but found '{}'".format(token))
2118 if leader is not None:
2119 token[0][:0] = [leader]
2120 leader = None
2121 name_addr.append(token)
2122 token, value = get_angle_addr(value)
2123 if leader is not None:
2124 token[:0] = [leader]
2125 name_addr.append(token)
2126 return name_addr, value
2127
2128def get_mailbox(value):
2129 """ mailbox = name-addr / addr-spec
2130
2131 """
2132 # The only way to figure out if we are dealing with a name-addr or an
2133 # addr-spec is to try parsing each one.
2134 mailbox = Mailbox()
2135 try:
2136 token, value = get_name_addr(value)
2137 except errors.HeaderParseError:
2138 try:
2139 token, value = get_addr_spec(value)
2140 except errors.HeaderParseError:
2141 raise errors.HeaderParseError(
2142 "expected mailbox but found '{}'".format(value))
2143 if any(isinstance(x, errors.InvalidHeaderDefect)
2144 for x in token.all_defects):
2145 mailbox.token_type = 'invalid-mailbox'
2146 mailbox.append(token)
2147 return mailbox, value
2148
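# Editorial sketch (illustration only): get_mailbox tries the name-addr form
# first and falls back to a bare addr-spec.  Both inputs are assumed examples.
def _example_get_mailbox():
    with_display_name, _ = get_mailbox('Fred Bloggs <fred@example.com>')
    bare_addr_spec, _ = get_mailbox('fred@example.com')
    return with_display_name, bare_addr_spec
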
2149def get_invalid_mailbox(value, endchars):
2150 """ Read everything up to one of the chars in endchars.
2151
2152 This is outside the formal grammar. The InvalidMailbox TokenList that is
2153 returned acts like a Mailbox, but the data attributes are None.
2154
2155 """
2156 invalid_mailbox = InvalidMailbox()
2157 while value and value[0] not in endchars:
2158 if value[0] in PHRASE_ENDS:
2159 invalid_mailbox.append(ValueTerminal(value[0],
2160 'misplaced-special'))
2161 value = value[1:]
2162 else:
2163 token, value = get_phrase(value)
2164 invalid_mailbox.append(token)
2165 return invalid_mailbox, value
2166
2167def get_mailbox_list(value):
2168 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
2169 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
2170
2171 For this routine we go outside the formal grammar in order to improve error
2172 handling. We recognize the end of the mailbox list only at the end of the
2173 value or at a ';' (the group terminator). This is so that we can turn
2174 invalid mailboxes into InvalidMailbox tokens and continue parsing any
2175 remaining valid mailboxes. We also allow all mailbox entries to be null,
2176 and this condition is handled appropriately at a higher level.
2177
2178 """
2179 mailbox_list = MailboxList()
2180 while value and value[0] != ';':
2181 try:
2182 token, value = get_mailbox(value)
2183 mailbox_list.append(token)
2184 except errors.HeaderParseError:
2185 leader = None
2186 if value[0] in CFWS_LEADER:
2187 leader, value = get_cfws(value)
2188 if not value or value[0] in ',;':
2189 mailbox_list.append(leader)
2190 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2191 "empty element in mailbox-list"))
2192 else:
2193 token, value = get_invalid_mailbox(value, ',;')
2194 if leader is not None:
2195 token[:0] = [leader]
2196 mailbox_list.append(token)
2197 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2198 "invalid mailbox in mailbox-list"))
2199 elif value[0] == ',':
2200 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2201 "empty element in mailbox-list"))
2202 else:
2203 token, value = get_invalid_mailbox(value, ',;')
2204 if leader is not None:
2205 token[:0] = [leader]
2206 mailbox_list.append(token)
2207 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2208 "invalid mailbox in mailbox-list"))
2209 if value and value[0] not in ',;':
2210 # Crap after mailbox; treat it as an invalid mailbox.
2211 # The mailbox info will still be available.
2212 mailbox = mailbox_list[-1]
2213 mailbox.token_type = 'invalid-mailbox'
2214 token, value = get_invalid_mailbox(value, ',;')
2215 mailbox.extend(token)
2216 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2217 "invalid mailbox in mailbox-list"))
2218 if value and value[0] == ',':
2219 mailbox_list.append(ListSeparator)
2220 value = value[1:]
2221 return mailbox_list, value
2222
2223
2224def get_group_list(value):
2225 """ group-list = mailbox-list / CFWS / obs-group-list
2226 obs-group-list = 1*([CFWS] ",") [CFWS]
2227
2228 """
2229 group_list = GroupList()
2230 if not value:
2231 group_list.defects.append(errors.InvalidHeaderDefect(
2232 "end of header before group-list"))
2233 return group_list, value
2234 leader = None
2235 if value and value[0] in CFWS_LEADER:
2236 leader, value = get_cfws(value)
2237 if not value:
2238 # This should never happen in email parsing, since CFWS-only is a
2239 # legal alternative to group-list in a group, which is the only
2240 # place group-list appears.
2241 group_list.defects.append(errors.InvalidHeaderDefect(
2242 "end of header in group-list"))
2243 group_list.append(leader)
2244 return group_list, value
2245 if value[0] == ';':
2246 group_list.append(leader)
2247 return group_list, value
2248 token, value = get_mailbox_list(value)
2249 if len(token.all_mailboxes)==0:
2250 if leader is not None:
2251 group_list.append(leader)
2252 group_list.extend(token)
2253 group_list.defects.append(errors.ObsoleteHeaderDefect(
2254 "group-list with empty entries"))
2255 return group_list, value
2256 if leader is not None:
2257 token[:0] = [leader]
2258 group_list.append(token)
2259 return group_list, value
2260
2261def get_group(value):
2262 """ group = display-name ":" [group-list] ";" [CFWS]
2263
2264 """
2265 group = Group()
2266 token, value = get_display_name(value)
2267 if not value or value[0] != ':':
2268 raise errors.HeaderParseError("expected ':' at end of group "
2269 "display name but found '{}'".format(value))
2270 group.append(token)
2271 group.append(ValueTerminal(':', 'group-display-name-terminator'))
2272 value = value[1:]
2273 if value and value[0] == ';':
2274 group.append(ValueTerminal(';', 'group-terminator'))
2275 return group, value[1:]
2276 token, value = get_group_list(value)
2277 group.append(token)
2278 if not value:
2279 group.defects.append(errors.InvalidHeaderDefect(
2280 "end of header in group"))
2281 if value[0] != ';':
2282 raise errors.HeaderParseError(
2283 "expected ';' at end of group but found {}".format(value))
2284 group.append(ValueTerminal(';', 'group-terminator'))
2285 value = value[1:]
2286 if value and value[0] in CFWS_LEADER:
2287 token, value = get_cfws(value)
2288 group.append(token)
2289 return group, value
2290
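# Editorial sketch (illustration only): a group is a display-name, a ':', an
# optional mailbox list, and a terminating ';'.  The header text is assumed.
def _example_get_group():
    group, rest = get_group('Monty Python: eric@example.com, john@example.com;')
    return group, rest
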
2291def get_address(value):
2292 """ address = mailbox / group
2293
2294 Note that counter-intuitively, an address can be either a single address or
2295 a list of addresses (a group). This is why the returned Address object has
2296 a 'mailboxes' attribute which treats a single address as a list of length
2297 one. When you need to differentiate between the two cases, extract the single
2298 element, which is either a mailbox or a group token.
2299
2300 """
2301 # The formal grammar isn't very helpful when parsing an address. mailbox
2302 # and group, especially when allowing for obsolete forms, start off very
2303 # similarly. It is only when you reach one of @, <, or : that you know
2304 # what you've got. So, we try each one in turn, starting with the more
2305 # likely of the two. We could perhaps make this more efficient by looking
2306 # for a phrase and then branching based on the next character, but that
2307 # would be a premature optimization.
2308 address = Address()
2309 try:
2310 token, value = get_group(value)
2311 except errors.HeaderParseError:
2312 try:
2313 token, value = get_mailbox(value)
2314 except errors.HeaderParseError:
2315 raise errors.HeaderParseError(
2316 "expected address but found '{}'".format(value))
2317 address.append(token)
2318 return address, value
2319
2320def get_address_list(value):
2321 """ address_list = (address *("," address)) / obs-addr-list
2322 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
2323
2324 We depart from the formal grammar here by continuing to parse until the end
2325 of the input, assuming the input to be entirely composed of an
2326 address-list. This is always true in email parsing, and allows us
2327 to skip invalid addresses to parse additional valid ones.
2328
2329 """
2330 address_list = AddressList()
2331 while value:
2332 try:
2333 token, value = get_address(value)
2334 address_list.append(token)
2335 except errors.HeaderParseError as err:
2336 leader = None
2337 if value[0] in CFWS_LEADER:
2338 leader, value = get_cfws(value)
2339 if not value or value[0] == ',':
2340 address_list.append(leader)
2341 address_list.defects.append(errors.ObsoleteHeaderDefect(
2342 "address-list entry with no content"))
2343 else:
2344 token, value = get_invalid_mailbox(value, ',')
2345 if leader is not None:
2346 token[:0] = [leader]
2347 address_list.append(Address([token]))
2348 address_list.defects.append(errors.InvalidHeaderDefect(
2349 "invalid address in address-list"))
2350 elif value[0] == ',':
2351 address_list.defects.append(errors.ObsoleteHeaderDefect(
2352 "empty element in address-list"))
2353 else:
2354 token, value = get_invalid_mailbox(value, ',')
2355 if leader is not None:
2356 token[:0] = [leader]
2357 address_list.append(Address([token]))
2358 address_list.defects.append(errors.InvalidHeaderDefect(
2359 "invalid address in address-list"))
2360 if value and value[0] != ',':
2361 # Crap after address; treat it as an invalid mailbox.
2362 # The mailbox info will still be available.
2363 mailbox = address_list[-1][0]
2364 mailbox.token_type = 'invalid-mailbox'
2365 token, value = get_invalid_mailbox(value, ',')
2366 mailbox.extend(token)
2367 address_list.defects.append(errors.InvalidHeaderDefect(
2368 "invalid address in address-list"))
2369 if value: # Must be a , at this point.
2370 address_list.append(ValueTerminal(',', 'list-separator'))
2371 value = value[1:]
2372 return address_list, value
2373
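# Editorial sketch (illustration only): an address-list mixing a name-addr
# and a bare addr-spec.  The addresses are assumed examples; real callers
# reach this through the structured header classes rather than directly.
def _example_get_address_list():
    address_list, rest = get_address_list(
        'Alice Example <alice@example.com>, bob@example.com')
    # The list should contain two Address tokens with a list-separator
    # terminal between them; rest is empty.
    return address_list, rest
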
2374#
2375# XXX: As I begin to add additional header parsers, I'm realizing we probably
2376# have two level of parser routines: the get_XXX methods that get a token in
2377# the grammar, and parse_XXX methods that parse an entire field value. So
2378# get_address_list above should really be a parse_ method, as probably should
2379# be get_unstructured.
2380#
2381
2382def parse_mime_version(value):
2383 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
2384
2385 """
2386 # The [CFWS] is implicit in the RFC 2045 BNF.
2387 # XXX: This routine is a bit verbose, should factor out a get_int method.
2388 mime_version = MIMEVersion()
2389 if not value:
2390 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2391 "Missing MIME version number (eg: 1.0)"))
2392 return mime_version
2393 if value[0] in CFWS_LEADER:
2394 token, value = get_cfws(value)
2395 mime_version.append(token)
2396 if not value:
2397 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2398 "Expected MIME version number but found only CFWS"))
2399 digits = ''
2400 while value and value[0] != '.' and value[0] not in CFWS_LEADER:
2401 digits += value[0]
2402 value = value[1:]
2403 if not digits.isdigit():
2404 mime_version.defects.append(errors.InvalidHeaderDefect(
2405 "Expected MIME major version number but found {!r}".format(digits)))
2406 mime_version.append(ValueTerminal(digits, 'xtext'))
2407 else:
2408 mime_version.major = int(digits)
2409 mime_version.append(ValueTerminal(digits, 'digits'))
2410 if value and value[0] in CFWS_LEADER:
2411 token, value = get_cfws(value)
2412 mime_version.append(token)
2413 if not value or value[0] != '.':
2414 if mime_version.major is not None:
2415 mime_version.defects.append(errors.InvalidHeaderDefect(
2416 "Incomplete MIME version; found only major number"))
2417 if value:
2418 mime_version.append(ValueTerminal(value, 'xtext'))
2419 return mime_version
2420 mime_version.append(ValueTerminal('.', 'version-separator'))
2421 value = value[1:]
2422 if value and value[0] in CFWS_LEADER:
2423 token, value = get_cfws(value)
2424 mime_version.append(token)
2425 if not value:
2426 if mime_version.major is not None:
2427 mime_version.defects.append(errors.InvalidHeaderDefect(
2428 "Incomplete MIME version; found only major number"))
2429 return mime_version
2430 digits = ''
2431 while value and value[0] not in CFWS_LEADER:
2432 digits += value[0]
2433 value = value[1:]
2434 if not digits.isdigit():
2435 mime_version.defects.append(errors.InvalidHeaderDefect(
2436 "Expected MIME minor version number but found {!r}".format(digits)))
2437 mime_version.append(ValueTerminal(digits, 'xtext'))
2438 else:
2439 mime_version.minor = int(digits)
2440 mime_version.append(ValueTerminal(digits, 'digits'))
2441 if value and value[0] in CFWS_LEADER:
2442 token, value = get_cfws(value)
2443 mime_version.append(token)
2444 if value:
2445 mime_version.defects.append(errors.InvalidHeaderDefect(
2446 "Excess non-CFWS text after MIME version"))
2447 mime_version.append(ValueTerminal(value, 'xtext'))
2448 return mime_version
2449
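# Editorial sketch (illustration only): parsing a normal MIME-Version value.
def _example_parse_mime_version():
    mime_version = parse_mime_version('1.0')
    # mime_version.major == 1 and mime_version.minor == 0, with no defects.
    return mime_version
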
2450def get_invalid_parameter(value):
2451 """ Read everything up to the next ';'.
2452
2453 This is outside the formal grammar. The InvalidParameter TokenList that is
2454 returned acts like a Parameter, but the data attributes are None.
2455
2456 """
2457 invalid_parameter = InvalidParameter()
2458 while value and value[0] != ';':
2459 if value[0] in PHRASE_ENDS:
2460 invalid_parameter.append(ValueTerminal(value[0],
2461 'misplaced-special'))
2462 value = value[1:]
2463 else:
2464 token, value = get_phrase(value)
2465 invalid_parameter.append(token)
2466 return invalid_parameter, value
2467
2468def get_ttext(value):
2469 """ttext = <matches _ttext_matcher>
2470
2471 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
2472 defects list if we find non-ttext characters. We also register defects for
2473 *any* non-printables even though the RFC doesn't exclude all of them,
2474 because we follow the spirit of RFC 5322.
2475
2476 """
2477 m = _non_token_end_matcher(value)
2478 if not m:
2479 raise errors.HeaderParseError(
2480 "expected ttext but found '{}'".format(value))
2481 ttext = m.group()
2482 value = value[len(ttext):]
2483 ttext = ValueTerminal(ttext, 'ttext')
2484 _validate_xtext(ttext)
2485 return ttext, value
2486
2487def get_token(value):
2488 """token = [CFWS] 1*ttext [CFWS]
2489
2490 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
2491 tspecials. We also exclude tabs even though the RFC doesn't.
2492
2493 The RFC implies the CFWS but is not explicit about it in the BNF.
2494
2495 """
2496 mtoken = Token()
2497 if value and value[0] in CFWS_LEADER:
2498 token, value = get_cfws(value)
2499 mtoken.append(token)
2500 if value and value[0] in TOKEN_ENDS:
2501 raise errors.HeaderParseError(
2502 "expected token but found '{}'".format(value))
2503 token, value = get_ttext(value)
2504 mtoken.append(token)
2505 if value and value[0] in CFWS_LEADER:
2506 token, value = get_cfws(value)
2507 mtoken.append(token)
2508 return mtoken, value
2509
2510def get_attrtext(value):
2511 """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
2512
2513 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
2514 token's defects list if we find non-attrtext characters. We also register
2515 defects for *any* non-printables even though the RFC doesn't exclude all of
2516 them, because we follow the spirit of RFC 5322.
2517
2518 """
2519 m = _non_attribute_end_matcher(value)
2520 if not m:
2521 raise errors.HeaderParseError(
2522 "expected attrtext but found {!r}".format(value))
2523 attrtext = m.group()
2524 value = value[len(attrtext):]
2525 attrtext = ValueTerminal(attrtext, 'attrtext')
2526 _validate_xtext(attrtext)
2527 return attrtext, value
2528
2529def get_attribute(value):
2530 """ [CFWS] 1*attrtext [CFWS]
2531
2532 This version of the BNF makes the CFWS explicit, and as usual we use a
2533 value terminal for the actual run of characters. The RFC equivalent of
2534 attrtext is the token characters, with the subtraction of '*', "'", and '%'.
2535 We include tab in the excluded set just as we do for token.
2536
2537 """
2538 attribute = Attribute()
2539 if value and value[0] in CFWS_LEADER:
2540 token, value = get_cfws(value)
2541 attribute.append(token)
2542 if value and value[0] in ATTRIBUTE_ENDS:
2543 raise errors.HeaderParseError(
2544 "expected token but found '{}'".format(value))
2545 token, value = get_attrtext(value)
2546 attribute.append(token)
2547 if value and value[0] in CFWS_LEADER:
2548 token, value = get_cfws(value)
2549 attribute.append(token)
2550 return attribute, value
2551
2552def get_extended_attrtext(value):
2553 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
2554
2555 This is a special parsing routine so that we get a value that
2556 includes % escapes as a single string (which we decode as a single
2557 string later).
2558
2559 """
2560 m = _non_extended_attribute_end_matcher(value)
2561 if not m:
2562 raise errors.HeaderParseError(
2563 "expected extended attrtext but found {!r}".format(value))
2564 attrtext = m.group()
2565 value = value[len(attrtext):]
2566 attrtext = ValueTerminal(attrtext, 'extended-attrtext')
2567 _validate_xtext(attrtext)
2568 return attrtext, value
2569
2570def get_extended_attribute(value):
2571 """ [CFWS] 1*extended_attrtext [CFWS]
2572
2573 This is like the non-extended version except we allow % characters, so that
2574 we can pick up an encoded value as a single string.
2575
2576 """
2577 # XXX: should we have an ExtendedAttribute TokenList?
2578 attribute = Attribute()
2579 if value and value[0] in CFWS_LEADER:
2580 token, value = get_cfws(value)
2581 attribute.append(token)
2582 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
2583 raise errors.HeaderParseError(
2584 "expected token but found '{}'".format(value))
2585 token, value = get_extended_attrtext(value)
2586 attribute.append(token)
2587 if value and value[0] in CFWS_LEADER:
2588 token, value = get_cfws(value)
2589 attribute.append(token)
2590 return attribute, value
2591
2592def get_section(value):
2593 """ '*' digits
2594
2595 The formal BNF is more complicated because leading 0s are not allowed. We
2596 check for that and add a defect. We also assume no CFWS is allowed between
2597 the '*' and the digits, though the RFC is not crystal clear on that.
2598 The caller should already have dealt with leading CFWS.
2599
2600 """
2601 section = Section()
2602 if not value or value[0] != '*':
2603 raise errors.HeaderParseError("Expected section but found {}".format(
2604 value))
2605 section.append(ValueTerminal('*', 'section-marker'))
2606 value = value[1:]
2607 if not value or not value[0].isdigit():
2608 raise errors.HeaderParseError("Expected section number but "
2609 "found {}".format(value))
2610 digits = ''
2611 while value and value[0].isdigit():
2612 digits += value[0]
2613 value = value[1:]
2614 if digits[0] == '0' and digits != '0':
2615 section.defects.append(errors.InvalidHeaderDefect("section number "
2616 "has an invalid leading 0"))
2617 section.number = int(digits)
2618 section.append(ValueTerminal(digits, 'digits'))
2619 return section, value
2620
2621
2622def get_value(value):
2623 """ quoted-string / attribute
2624
2625 """
2626 v = Value()
2627 if not value:
2628 raise errors.HeaderParseError("Expected value but found end of string")
2629 leader = None
2630 if value[0] in CFWS_LEADER:
2631 leader, value = get_cfws(value)
2632 if not value:
2633 raise errors.HeaderParseError("Expected value but found "
2634 "only {}".format(leader))
2635 if value[0] == '"':
2636 token, value = get_quoted_string(value)
2637 else:
2638 token, value = get_extended_attribute(value)
2639 if leader is not None:
2640 token[:0] = [leader]
2641 v.append(token)
2642 return v, value
2643
2644def get_parameter(value):
2645 """ attribute [section] ["*"] [CFWS] "=" value
2646
2647 The CFWS is implied by the RFC but not made explicit in the BNF. This
2648 simplified form of the BNF from the RFC is made to conform with the RFC BNF
2649 through some extra checks. We do it this way because it makes both error
2650 recovery and working with the resulting parse tree easier.
2651 """
2652 # It is possible CFWS would also be implicitly allowed between the section
2653 # and the 'extended-attribute' marker (the '*') , but we've never seen that
2654 # in the wild and we will therefore ignore the possibility.
2655 param = Parameter()
2656 token, value = get_attribute(value)
2657 param.append(token)
2658 if not value or value[0] == ';':
2659 param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2660 "name ({}) but no value".format(token)))
2661 return param, value
2662 if value[0] == '*':
2663 try:
2664 token, value = get_section(value)
2665 param.sectioned = True
2666 param.append(token)
2667 except errors.HeaderParseError:
2668 pass
2669 if not value:
2670 raise errors.HeaderParseError("Incomplete parameter")
2671 if value[0] == '*':
2672 param.append(ValueTerminal('*', 'extended-parameter-marker'))
2673 value = value[1:]
2674 param.extended = True
2675 if value[0] != '=':
2676 raise errors.HeaderParseError("Parameter not followed by '='")
2677 param.append(ValueTerminal('=', 'parameter-separator'))
2678 value = value[1:]
2679 leader = None
2680 if value and value[0] in CFWS_LEADER:
2681 token, value = get_cfws(value)
2682 param.append(token)
2683 remainder = None
2684 appendto = param
2685 if param.extended and value and value[0] == '"':
2686 # Now for some serious hackery to handle the common invalid case of
2687 # double quotes around an extended value. We also accept (with defect)
2688 # a value marked as encoded that isn't really.
2689 qstring, remainder = get_quoted_string(value)
2690 inner_value = qstring.stripped_value
2691 semi_valid = False
2692 if param.section_number == 0:
2693 if inner_value and inner_value[0] == "'":
2694 semi_valid = True
2695 else:
2696 token, rest = get_attrtext(inner_value)
2697 if rest and rest[0] == "'":
2698 semi_valid = True
2699 else:
2700 try:
2701 token, rest = get_extended_attrtext(inner_value)
2702 except errors.HeaderParseError:
2703 pass
2704 else:
2705 if not rest:
2706 semi_valid = True
2707 if semi_valid:
2708 param.defects.append(errors.InvalidHeaderDefect(
2709 "Quoted string value for extended parameter is invalid"))
2710 param.append(qstring)
2711 for t in qstring:
2712 if t.token_type == 'bare-quoted-string':
2713 t[:] = []
2714 appendto = t
2715 break
2716 value = inner_value
2717 else:
2718 remainder = None
2719 param.defects.append(errors.InvalidHeaderDefect(
2720 "Parameter marked as extended but appears to have a "
2721 "quoted string value that is non-encoded"))
2722 if value and value[0] == "'":
2723 token = None
2724 else:
2725 token, value = get_value(value)
2726 if not param.extended or param.section_number > 0:
2727 if not value or value[0] != "'":
2728 appendto.append(token)
2729 if remainder is not None:
2730 assert not value, value
2731 value = remainder
2732 return param, value
2733 param.defects.append(errors.InvalidHeaderDefect(
2734 "Apparent initial-extended-value but attribute "
2735 "was not marked as extended or was not initial section"))
2736 if not value:
2737 # Assume the charset/lang is missing and the token is the value.
2738 param.defects.append(errors.InvalidHeaderDefect(
2739 "Missing required charset/lang delimiters"))
2740 appendto.append(token)
2741 if remainder is None:
2742 return param, value
2743 else:
2744 if token is not None:
2745 for t in token:
2746 if t.token_type == 'extended-attrtext':
2747 break
2748 t.token_type = 'attrtext'
2749 appendto.append(t)
2750 param.charset = t.value
2751 if value[0] != "'":
2752 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2753 "delimiter, but found {!r}".format(value))
2754 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2755 value = value[1:]
2756 if value and value[0] != "'":
2757 token, value = get_attrtext(value)
2758 appendto.append(token)
2759 param.lang = token.value
2760 if not value or value[0] != "'":
2761 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2762 "delimiter, but found {}".format(value))
2763 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2764 value = value[1:]
2765 if remainder is not None:
2766 # Treat the rest of value as bare quoted string content.
2767 v = Value()
2768 while value:
2769 if value[0] in WSP:
2770 token, value = get_fws(value)
2771 else:
2772 token, value = get_qcontent(value)
2773 v.append(token)
2774 token = v
2775 else:
2776 token, value = get_value(value)
2777 appendto.append(token)
2778 if remainder is not None:
2779 assert not value, value
2780 value = remainder
2781 return param, value
2782
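# Editorial sketch (illustration only): a simple, non-extended parameter.
# The parameter text is an assumed example.
def _example_get_parameter():
    param, rest = get_parameter('charset=utf-8; format=flowed')
    # param covers 'charset=utf-8'; rest begins with the ';' so that
    # parse_mime_parameters can continue with the next parameter.
    return param, rest
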
2783def parse_mime_parameters(value):
2784 """ parameter *( ";" parameter )
2785
2786 That BNF is meant to indicate this routine should only be called after
2787 finding and handling the leading ';'. There is no corresponding rule in
2788 the formal RFC grammar, but it is more convenient for us for the set of
2789 parameters to be treated as its own TokenList.
2790
2791 This is a 'parse' routine because it consumes the remaining value, but it
2792 would never be called to parse a full header. Instead it is called to
2793 parse everything after the non-parameter value of a specific MIME header.
2794
2795 """
2796 mime_parameters = MimeParameters()
2797 while value:
2798 try:
2799 token, value = get_parameter(value)
2800 mime_parameters.append(token)
2801 except errors.HeaderParseError as err:
2802 leader = None
2803 if value[0] in CFWS_LEADER:
2804 leader, value = get_cfws(value)
2805 if not value:
2806 mime_parameters.append(leader)
2807 return mime_parameters
2808 if value[0] == ';':
2809 if leader is not None:
2810 mime_parameters.append(leader)
2811 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2812 "parameter entry with no content"))
2813 else:
2814 token, value = get_invalid_parameter(value)
2815 if leader:
2816 token[:0] = [leader]
2817 mime_parameters.append(token)
2818 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2819 "invalid parameter {!r}".format(token)))
2820 if value and value[0] != ';':
2821 # Junk after the otherwise valid parameter. Mark it as
2822 # invalid, but it will have a value.
2823 param = mime_parameters[-1]
2824 param.token_type = 'invalid-parameter'
2825 token, value = get_invalid_parameter(value)
2826 param.extend(token)
2827 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2828 "parameter with invalid trailing text {!r}".format(token)))
2829 if value:
2830 # Must be a ';' at this point.
2831 mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2832 value = value[1:]
2833 return mime_parameters
2834
2835def _find_mime_parameters(tokenlist, value):
2836 """Do our best to find the parameters in an invalid MIME header
2837
2838 """
2839 while value and value[0] != ';':
2840 if value[0] in PHRASE_ENDS:
2841 tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2842 value = value[1:]
2843 else:
2844 token, value = get_phrase(value)
2845 tokenlist.append(token)
2846 if not value:
2847 return
2848 tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2849 tokenlist.append(parse_mime_parameters(value[1:]))
2850
2851def parse_content_type_header(value):
2852 """ maintype "/" subtype *( ";" parameter )
2853
2854 The maintype and subtype are tokens. Theoretically they could
2855 be checked against the official IANA list + x-token, but we
2856 don't do that.
2857 """
2858 ctype = ContentType()
2859 recover = False
2860 if not value:
2861 ctype.defects.append(errors.HeaderMissingRequiredValue(
2862 "Missing content type specification"))
2863 return ctype
2864 try:
2865 token, value = get_token(value)
2866 except errors.HeaderParseError:
2867 ctype.defects.append(errors.InvalidHeaderDefect(
2868 "Expected content maintype but found {!r}".format(value)))
2869 _find_mime_parameters(ctype, value)
2870 return ctype
2871 ctype.append(token)
2872 # XXX: If we really want to follow the formal grammar we should make
2873 # maintype and subtype specialized TokenLists here. Probably not worth it.
2874 if not value or value[0] != '/':
2875 ctype.defects.append(errors.InvalidHeaderDefect(
2876 "Invalid content type"))
2877 if value:
2878 _find_mime_parameters(ctype, value)
2879 return ctype
2880 ctype.maintype = token.value.strip().lower()
2881 ctype.append(ValueTerminal('/', 'content-type-separator'))
2882 value = value[1:]
2883 try:
2884 token, value = get_token(value)
2885 except errors.HeaderParseError:
2886 ctype.defects.append(errors.InvalidHeaderDefect(
2887 "Expected content subtype but found {!r}".format(value)))
2888 _find_mime_parameters(ctype, value)
2889 return ctype
2890 ctype.append(token)
2891 ctype.subtype = token.value.strip().lower()
2892 if not value:
2893 return ctype
2894 if value[0] != ';':
2895 ctype.defects.append(errors.InvalidHeaderDefect(
2896 "Only parameters are valid after content type, but "
2897 "found {!r}".format(value)))
2898 # The RFC requires that a syntactically invalid content-type be treated
2899 # as text/plain. Perhaps we should postel this, but we should probably
2900 # only do that if we were checking the subtype value against IANA.
2901 del ctype.maintype, ctype.subtype
2902 _find_mime_parameters(ctype, value)
2903 return ctype
2904 ctype.append(ValueTerminal(';', 'parameter-separator'))
2905 ctype.append(parse_mime_parameters(value[1:]))
2906 return ctype
2907
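# Editorial sketch (illustration only): a routine Content-Type value with one
# parameter.  The header text is an assumed example.
def _example_parse_content_type_header():
    ctype = parse_content_type_header('text/plain; charset="utf-8"')
    # ctype.maintype == 'text', ctype.subtype == 'plain', and the parameters
    # are available through the appended MimeParameters sub-token.
    return ctype
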
2908def parse_content_disposition_header(value):
2909 """ disposition-type *( ";" parameter )
2910
2911 """
2912 disp_header = ContentDisposition()
2913 if not value:
2914 disp_header.defects.append(errors.HeaderMissingRequiredValue(
2915 "Missing content disposition"))
2916 return disp_header
2917 try:
2918 token, value = get_token(value)
2919 except errors.HeaderParseError:
2920 disp_header.defects.append(errors.InvalidHeaderDefect(
2921 "Expected content disposition but found {!r}".format(value)))
2922 _find_mime_parameters(disp_header, value)
2923 return disp_header
2924 disp_header.append(token)
2925 disp_header.content_disposition = token.value.strip().lower()
2926 if not value:
2927 return disp_header
2928 if value[0] != ';':
2929 disp_header.defects.append(errors.InvalidHeaderDefect(
2930 "Only parameters are valid after content disposition, but "
2931 "found {!r}".format(value)))
2932 _find_mime_parameters(disp_header, value)
2933 return disp_header
2934 disp_header.append(ValueTerminal(';', 'parameter-separator'))
2935 disp_header.append(parse_mime_parameters(value[1:]))
2936 return disp_header
2937
2938def parse_content_transfer_encoding_header(value):
2939 """ mechanism
2940
2941 """
2942 # We should probably validate the values, since the list is fixed.
2943 cte_header = ContentTransferEncoding()
2944 if not value:
2945 cte_header.defects.append(errors.HeaderMissingRequiredValue(
2946 "Missing content transfer encoding"))
2947 return cte_header
2948 try:
2949 token, value = get_token(value)
2950 except errors.HeaderParseError:
2951 cte_header.defects.append(errors.InvalidHeaderDefect(
2952 "Expected content transfer encoding but found {!r}".format(value)))
2953 else:
2954 cte_header.append(token)
2955 cte_header.cte = token.value.strip().lower()
2956 if not value:
2957 return cte_header
2958 while value:
2959 cte_header.defects.append(errors.InvalidHeaderDefect(
2960 "Extra text after content transfer encoding"))
2961 if value[0] in PHRASE_ENDS:
2962 cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2963 value = value[1:]
2964 else:
2965 token, value = get_phrase(value)
2966 cte_header.append(token)
2967 return cte_header
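
# Editorial sketch (illustration only): a routine Content-Transfer-Encoding
# value.  The mechanism name is an assumed example.
def _example_parse_content_transfer_encoding_header():
    cte_header = parse_content_transfer_encoding_header('base64')
    # cte_header.cte == 'base64' with no defects.
    return cte_header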