"""Header value parser implementing various email-related RFC parsing rules.

The parsing methods defined in this module implement various email-related
parsing rules.  Principal among them is RFC 5322, which is the follow-on
to RFC 2822 and primarily a clarification of the former.  It also implements
RFC 2047 encoded word decoding.

RFC 5322 goes to considerable trouble to maintain backward compatibility with
RFC 822 in the parse phase, while cleaning up the structure on the generation
phase.  This parser supports correct RFC 5322 generation by tagging white space
as folding white space only when folding is allowed in the non-obsolete rule
sets.  Actually, the parser is even more generous when accepting input than RFC
5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
Where possible, deviations from the standard are annotated on the 'defects'
attribute of tokens that deviate.

The general structure of the parser follows RFC 5322, and uses its terminology
where there is a direct correspondence.  Where the implementation requires a
somewhat different structure than that used by the formal grammar, new terms
that mimic the closest existing terms are used.  Thus, it really helps to have
a copy of RFC 5322 handy when studying this code.

Input to the parser is a string that has already been unfolded according to
RFC 5322 rules.  According to the RFC this unfolding is the very first step, and
this parser leaves the unfolding step to a higher level message parser, which
will have already detected the line breaks that need unfolding while
determining the beginning and end of each header.

The output of the parser is a TokenList object, which is a list subclass.  A
TokenList is a recursive data structure.  The terminal nodes of the structure
are Terminal objects, which are subclasses of str.  These do not correspond
directly to terminal objects in the formal grammar, but are instead more
practical higher level combinations of true terminals.

All TokenList and Terminal objects have a 'value' attribute, which produces the
semantically meaningful value of that part of the parse subtree.  The value of
all whitespace tokens (no matter how many sub-tokens they may contain) is a
single space, as per the RFC rules.  This includes 'CFWS', which is herein
included in the general class of whitespace tokens.  There is one exception to
the rule that whitespace tokens are collapsed into single spaces in values: in
the value of a 'bare-quoted-string' (a quoted-string with no leading or
trailing whitespace), any whitespace that appeared between the quotation marks
is preserved in the returned value.  Note that in all Terminal strings quoted
pairs are turned into their unquoted values.

All TokenList and Terminal objects also have a string value, which attempts to
be a "canonical" representation of the RFC-compliant form of the substring that
produced the parsed subtree, including minimal use of quoted pair quoting.
Whitespace runs are not collapsed.

Comment tokens also have a 'content' attribute providing the string found
between the parens (including any nested comments) with whitespace preserved.

All TokenList and Terminal objects have a 'defects' attribute which is a
possibly empty list of all the defects found while creating the token.  Defects
may appear on any token in the tree, and a composite list of all defects in the
subtree is available through the 'all_defects' attribute of any node.  (For
Terminal nodes, x.defects == x.all_defects.)

Each object in a parse tree is called a 'token', and each has a 'token_type'
attribute that gives the name from the RFC 5322 grammar that it represents.
Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
may be produced: 'ptext'.  A 'ptext' is a string of printable ASCII characters.
It is returned in place of lists of (ctext/quoted-pair) and
(qtext/quoted-pair).

XXX: provide complete list of token types.
"""

import re
import urllib   # For urllib.parse.unquote
from string import hexdigits
from collections import OrderedDict
from operator import itemgetter
from email import _encoded_words as _ew
from email import errors
from email import utils

#
# Useful constants and functions
#

WSP = set(' \t')
CFWS_LEADER = WSP | set('(')
SPECIALS = set(r'()<>@,:;.\"[]')
ATOM_ENDS = SPECIALS | WSP
DOT_ATOM_ENDS = ATOM_ENDS - set('.')
# '.', '"', and '(' do not end phrases in order to support obs-phrase
PHRASE_ENDS = SPECIALS - set('."(')
TSPECIALS = (SPECIALS | set('/?=')) - set('.')
TOKEN_ENDS = TSPECIALS | WSP
ASPECIALS = TSPECIALS | set("*'%")
ATTRIBUTE_ENDS = ASPECIALS | WSP
EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')

def quote_string(value):
    return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'

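# Added usage note (not in the original source); quote_string wraps its
# argument in DQUOTEs and backslash-escapes any backslash or DQUOTE.  Reprs
# shown are approximate:
#
#   >>> quote_string('Bob Smith')
#   '"Bob Smith"'
#   >>> quote_string('say "hi"')
#   '"say \\"hi\\""'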
#
# Accumulator for header folding
#

class _Folded:

    def __init__(self, maxlen, policy):
        self.maxlen = maxlen
        self.policy = policy
        self.lastlen = 0
        self.stickyspace = None
        self.firstline = True
        self.done = []
        self.current = []

    def newline(self):
        self.done.extend(self.current)
        self.done.append(self.policy.linesep)
        self.current.clear()
        self.lastlen = 0

    def finalize(self):
        if self.current:
            self.newline()

    def __str__(self):
        return ''.join(self.done)

    def append(self, stoken):
        self.current.append(stoken)

    def append_if_fits(self, token, stoken=None):
        if stoken is None:
            stoken = str(token)
        l = len(stoken)
        if self.stickyspace is not None:
            stickyspace_len = len(self.stickyspace)
            if self.lastlen + stickyspace_len + l <= self.maxlen:
                self.current.append(self.stickyspace)
                self.lastlen += stickyspace_len
                self.current.append(stoken)
                self.lastlen += l
                self.stickyspace = None
                self.firstline = False
                return True
            if token.has_fws:
                ws = token.pop_leading_fws()
                if ws is not None:
                    self.stickyspace += str(ws)
                    stickyspace_len += len(ws)
                token._fold(self)
                return True
            if stickyspace_len and l + 1 <= self.maxlen:
                margin = self.maxlen - l
                if 0 < margin < stickyspace_len:
                    trim = stickyspace_len - margin
                    self.current.append(self.stickyspace[:trim])
                    self.stickyspace = self.stickyspace[trim:]
                    stickyspace_len = trim
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.lastlen = l + stickyspace_len
                self.stickyspace = None
                self.firstline = False
                return True
            if not self.firstline:
                self.newline()
            self.current.append(self.stickyspace)
            self.current.append(stoken)
            self.stickyspace = None
            self.firstline = False
            return True
        if self.lastlen + l <= self.maxlen:
            self.current.append(stoken)
            self.lastlen += l
            return True
        if l < self.maxlen:
            self.newline()
            self.current.append(stoken)
            self.lastlen = l
            return True
        return False

#
# TokenList and its subclasses
#
186
187class TokenList(list):
188
189 token_type = None
190
191 def __init__(self, *args, **kw):
192 super().__init__(*args, **kw)
193 self.defects = []
194
195 def __str__(self):
196 return ''.join(str(x) for x in self)
197
198 def __repr__(self):
199 return '{}({})'.format(self.__class__.__name__,
200 super().__repr__())
201
202 @property
203 def value(self):
204 return ''.join(x.value for x in self if x.value)
205
206 @property
207 def all_defects(self):
208 return sum((x.all_defects for x in self), self.defects)
209
    #
    # Folding API
    #
    # parts():
    #
    # return a list of objects that constitute the "higher level syntactic
    # objects" specified by the RFC as the best places to fold a header line.
    # The returned objects must include leading folding white space, even if
    # this means mutating the underlying parse tree of the object.  Each object
    # is only responsible for returning *its* parts, and should not drill down
    # to any lower level except as required to meet the leading folding white
    # space constraint.
    #
    # _fold(folded):
    #
    #   folded: the result accumulator.  This is an instance of _Folded.
    #       (XXX: I haven't finished factoring this out yet, the folding code
    #       pretty much uses this as a state object.)  When the folded.current
    #       contains as much text as will fit, the _fold method should call
    #       folded.newline.
    #   folded.lastlen: the current length of the text stored in folded.current.
    #   folded.maxlen: The maximum number of characters that may appear on a
    #       folded line.  Differs from the policy setting in that "no limit" is
    #       represented by +inf, which means it can be used in the trivially
    #       logical fashion in comparisons.
    #
    # Currently no subclasses implement parts, and I think this will remain
    # true.  A subclass only needs to implement _fold when the generic version
    # isn't sufficient.  _fold will need to be implemented primarily when it is
    # possible for encoded words to appear in the specialized token-list, since
    # there is no generic algorithm that can know where exactly the encoded
    # words are allowed.  A _fold implementation is responsible for filling
    # lines in the same general way that the top level _fold does.  It may, and
    # should, call the _fold method of sub-objects in a similar fashion to that
    # of the top level _fold.
    #
    # XXX: I'm hoping it will be possible to factor the existing code further
    # to reduce redundancy and make the logic clearer.
248
249 @property
250 def parts(self):
251 klass = self.__class__
252 this = []
253 for token in self:
254 if token.startswith_fws():
255 if this:
256 yield this[0] if len(this)==1 else klass(this)
257 this.clear()
258 end_ws = token.pop_trailing_ws()
259 this.append(token)
260 if end_ws:
261 yield klass(this)
262 this = [end_ws]
263 if this:
264 yield this[0] if len(this)==1 else klass(this)
265
266 def startswith_fws(self):
267 return self[0].startswith_fws()
268
269 def pop_leading_fws(self):
270 if self[0].token_type == 'fws':
271 return self.pop(0)
272 return self[0].pop_leading_fws()
273
274 def pop_trailing_ws(self):
275 if self[-1].token_type == 'cfws':
276 return self.pop(-1)
277 return self[-1].pop_trailing_ws()
278
279 @property
280 def has_fws(self):
281 for part in self:
282 if part.has_fws:
283 return True
284 return False
285
286 def has_leading_comment(self):
287 return self[0].has_leading_comment()
288
289 @property
290 def comments(self):
291 comments = []
292 for token in self:
293 comments.extend(token.comments)
294 return comments
295
296 def fold(self, *, policy):
297 # max_line_length 0/None means no limit, ie: infinitely long.
298 maxlen = policy.max_line_length or float("+inf")
299 folded = _Folded(maxlen, policy)
300 self._fold(folded)
301 folded.finalize()
302 return str(folded)
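    # Added usage sketch (not part of the original source): this fold() is
    # normally invoked for us by the email.headerregistry/policy machinery,
    # but assuming a policy object such as email.policy.default, something
    # like get_unstructured('a few words').fold(policy=email.policy.default)
    # would return the folded text terminated by policy.linesep.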
303
304 def as_encoded_word(self, charset):
305 # This works only for things returned by 'parts', which include
306 # the leading fws, if any, that should be used.
307 res = []
308 ws = self.pop_leading_fws()
309 if ws:
310 res.append(ws)
311 trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
312 res.append(_ew.encode(str(self), charset))
313 res.append(trailer)
314 return ''.join(res)
315
316 def cte_encode(self, charset, policy):
317 res = []
318 for part in self:
319 res.append(part.cte_encode(charset, policy))
320 return ''.join(res)
321
    def _fold(self, folded):
        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
        for part in self.parts:
            tstr = str(part)
            tlen = len(tstr)
            try:
                str(part).encode(encoding)
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                       for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    # XXX: this should be a policy setting when utf8 is False.
                    charset = 'utf-8'
                tstr = part.cte_encode(charset, folded.policy)
                tlen = len(tstr)
            if folded.append_if_fits(part, tstr):
                continue
            # Peel off the leading whitespace if any and make it sticky, to
            # avoid infinite recursion.
            ws = part.pop_leading_fws()
            if ws is not None:
                folded.stickyspace = str(ws)
                if folded.append_if_fits(part):
                    continue
            if part.has_fws:
                part._fold(folded)
                continue
            # There are no fold points in this one; it is too long for a single
            # line and can't be split...we just have to put it on its own line.
            folded.append(tstr)
            folded.newline()
354
355 def pprint(self, indent=''):
356 print('\n'.join(self._pp(indent='')))
357
358 def ppstr(self, indent=''):
359 return '\n'.join(self._pp(indent=''))
360
361 def _pp(self, indent=''):
362 yield '{}{}/{}('.format(
363 indent,
364 self.__class__.__name__,
365 self.token_type)
366 for token in self:
            if not hasattr(token, '_pp'):
                yield (indent + '    !! invalid element in token '
                                        'list: {!r}'.format(token))
            else:
                yield from token._pp(indent+'    ')
        if self.defects:
373 extra = ' Defects: {}'.format(self.defects)
374 else:
375 extra = ''
376 yield '{}){}'.format(indent, extra)
377
378
379class WhiteSpaceTokenList(TokenList):
380
381 @property
382 def value(self):
383 return ' '
384
385 @property
386 def comments(self):
387 return [x.content for x in self if x.token_type=='comment']
388
389
390class UnstructuredTokenList(TokenList):
391
392 token_type = 'unstructured'
393
    def _fold(self, folded):
        last_ew = None
        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
        for part in self.parts:
            tstr = str(part)
            is_ew = False
            try:
                str(part).encode(encoding)
            except UnicodeEncodeError:
403 if any(isinstance(x, errors.UndecodableBytesDefect)
404 for x in part.all_defects):
405 charset = 'unknown-8bit'
406 else:
407 charset = 'utf-8'
408 if last_ew is not None:
409 # We've already done an EW, combine this one with it
410 # if there's room.
411 chunk = get_unstructured(
412 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
413 oldlastlen = sum(len(x) for x in folded.current[:last_ew])
414 schunk = str(chunk)
415 lchunk = len(schunk)
416 if oldlastlen + lchunk <= folded.maxlen:
417 del folded.current[last_ew:]
418 folded.append(schunk)
419 folded.lastlen = oldlastlen + lchunk
420 continue
421 tstr = part.as_encoded_word(charset)
422 is_ew = True
423 if folded.append_if_fits(part, tstr):
424 if is_ew:
425 last_ew = len(folded.current) - 1
426 continue
427 if is_ew or last_ew:
428 # It's too big to fit on the line, but since we've
429 # got encoded words we can use encoded word folding.
430 part._fold_as_ew(folded)
431 continue
432 # Peel off the leading whitespace if any and make it sticky, to
433 # avoid infinite recursion.
434 ws = part.pop_leading_fws()
435 if ws is not None:
436 folded.stickyspace = str(ws)
437 if folded.append_if_fits(part):
438 continue
439 if part.has_fws:
                part._fold(folded)
                continue
442 # It can't be split...we just have to put it on its own line.
443 folded.append(tstr)
444 folded.newline()
445 last_ew = None
446
447 def cte_encode(self, charset, policy):
448 res = []
449 last_ew = None
450 for part in self:
451 spart = str(part)
452 try:
453 spart.encode('us-ascii')
454 res.append(spart)
455 except UnicodeEncodeError:
456 if last_ew is None:
457 res.append(part.cte_encode(charset, policy))
458 last_ew = len(res)
459 else:
460 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    res.append(tl.as_encoded_word(charset))
        return ''.join(res)
463
464
465class Phrase(TokenList):
466
467 token_type = 'phrase'
468
    def _fold(self, folded):
        # As with Unstructured, we can have pure ASCII with or without
        # surrogateescape encoded bytes, or we could have unicode.  But this
        # case is more complicated, since we have to deal with the various
        # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token has a
        # comment, that comment becomes a barrier across which we can't
        # compose encoded words.
        last_ew = None
        encoding = 'utf-8' if folded.policy.utf8 else 'ascii'
        for part in self.parts:
480 tstr = str(part)
481 tlen = len(tstr)
482 has_ew = False
483 try:
                str(part).encode(encoding)
            except UnicodeEncodeError:
486 if any(isinstance(x, errors.UndecodableBytesDefect)
487 for x in part.all_defects):
488 charset = 'unknown-8bit'
489 else:
490 charset = 'utf-8'
491 if last_ew is not None and not part.has_leading_comment():
492 # We've already done an EW, let's see if we can combine
493 # this one with it. The last_ew logic ensures that all we
494 # have at this point is atoms, no comments or quoted
495 # strings. So we can treat the text between the last
496 # encoded word and the content of this token as
497 # unstructured text, and things will work correctly. But
498 # we have to strip off any trailing comment on this token
499 # first, and if it is a quoted string we have to pull out
500 # the content (we're encoding it, so it no longer needs to
501 # be quoted).
502 if part[-1].token_type == 'cfws' and part.comments:
503 remainder = part.pop(-1)
504 else:
505 remainder = ''
506 for i, token in enumerate(part):
507 if token.token_type == 'bare-quoted-string':
508 part[i] = UnstructuredTokenList(token[:])
509 chunk = get_unstructured(
510 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
511 schunk = str(chunk)
512 lchunk = len(schunk)
513 if last_ew + lchunk <= folded.maxlen:
514 del folded.current[last_ew:]
515 folded.append(schunk)
516 folded.lastlen = sum(len(x) for x in folded.current)
517 continue
518 tstr = part.as_encoded_word(charset)
519 tlen = len(tstr)
520 has_ew = True
521 if folded.append_if_fits(part, tstr):
522 if has_ew and not part.comments:
523 last_ew = len(folded.current) - 1
524 elif part.comments or part.token_type == 'quoted-string':
525 # If a comment is involved we can't combine EWs. And if a
526 # quoted string is involved, it's not worth the effort to
527 # try to combine them.
528 last_ew = None
529 continue
530 part._fold(folded)
531
532 def cte_encode(self, charset, policy):
533 res = []
534 last_ew = None
535 is_ew = False
536 for part in self:
537 spart = str(part)
538 try:
539 spart.encode('us-ascii')
540 res.append(spart)
541 except UnicodeEncodeError:
542 is_ew = True
543 if last_ew is None:
544 if not part.comments:
545 last_ew = len(res)
546 res.append(part.cte_encode(charset, policy))
547 elif not part.has_leading_comment():
548 if part[-1].token_type == 'cfws' and part.comments:
549 remainder = part.pop(-1)
550 else:
551 remainder = ''
552 for i, token in enumerate(part):
553 if token.token_type == 'bare-quoted-string':
554 part[i] = UnstructuredTokenList(token[:])
555 tl = get_unstructured(''.join(res[last_ew:] + [spart]))
556 res[last_ew:] = [tl.as_encoded_word(charset)]
557 if part.comments or (not is_ew and part.token_type == 'quoted-string'):
558 last_ew = None
559 return ''.join(res)
560
561class Word(TokenList):
562
563 token_type = 'word'
564
565
566class CFWSList(WhiteSpaceTokenList):
567
568 token_type = 'cfws'
569
570 def has_leading_comment(self):
571 return bool(self.comments)
572
573
574class Atom(TokenList):
575
576 token_type = 'atom'
577
578
class Token(TokenList):
580
581 token_type = 'token'
582
583
class EncodedWord(TokenList):
585
586 token_type = 'encoded-word'
587 cte = None
588 charset = None
589 lang = None
590
591 @property
592 def encoded(self):
593 if self.cte is not None:
594 return self.cte
595 _ew.encode(str(self), self.charset)
596
597
598
599class QuotedString(TokenList):
600
601 token_type = 'quoted-string'
602
603 @property
604 def content(self):
605 for x in self:
606 if x.token_type == 'bare-quoted-string':
607 return x.value
608
609 @property
610 def quoted_value(self):
611 res = []
612 for x in self:
613 if x.token_type == 'bare-quoted-string':
614 res.append(str(x))
615 else:
616 res.append(x.value)
617 return ''.join(res)
618
    @property
    def stripped_value(self):
        for token in self:
            if token.token_type == 'bare-quoted-string':
                return token.value


class BareQuotedString(QuotedString):
627
628 token_type = 'bare-quoted-string'
629
630 def __str__(self):
        return quote_string(''.join(str(x) for x in self))

    @property
634 def value(self):
635 return ''.join(str(x) for x in self)
636
637
638class Comment(WhiteSpaceTokenList):
639
640 token_type = 'comment'
641
642 def __str__(self):
643 return ''.join(sum([
644 ["("],
645 [self.quote(x) for x in self],
646 [")"],
647 ], []))
648
649 def quote(self, value):
650 if value.token_type == 'comment':
651 return str(value)
652 return str(value).replace('\\', '\\\\').replace(
            '(', r'\(').replace(
            ')', r'\)')

    @property
657 def content(self):
658 return ''.join(str(x) for x in self)
659
660 @property
661 def comments(self):
662 return [self.content]
663
664class AddressList(TokenList):
665
666 token_type = 'address-list'
667
668 @property
669 def addresses(self):
670 return [x for x in self if x.token_type=='address']
671
672 @property
673 def mailboxes(self):
674 return sum((x.mailboxes
675 for x in self if x.token_type=='address'), [])
676
677 @property
678 def all_mailboxes(self):
679 return sum((x.all_mailboxes
680 for x in self if x.token_type=='address'), [])
681
682
683class Address(TokenList):
684
685 token_type = 'address'
686
687 @property
688 def display_name(self):
689 if self[0].token_type == 'group':
690 return self[0].display_name
691
692 @property
693 def mailboxes(self):
694 if self[0].token_type == 'mailbox':
695 return [self[0]]
696 elif self[0].token_type == 'invalid-mailbox':
697 return []
698 return self[0].mailboxes
699
700 @property
701 def all_mailboxes(self):
702 if self[0].token_type == 'mailbox':
703 return [self[0]]
704 elif self[0].token_type == 'invalid-mailbox':
705 return [self[0]]
706 return self[0].all_mailboxes
707
708class MailboxList(TokenList):
709
710 token_type = 'mailbox-list'
711
712 @property
713 def mailboxes(self):
714 return [x for x in self if x.token_type=='mailbox']
715
716 @property
717 def all_mailboxes(self):
718 return [x for x in self
719 if x.token_type in ('mailbox', 'invalid-mailbox')]
720
721
722class GroupList(TokenList):
723
724 token_type = 'group-list'
725
726 @property
727 def mailboxes(self):
728 if not self or self[0].token_type != 'mailbox-list':
729 return []
730 return self[0].mailboxes
731
732 @property
733 def all_mailboxes(self):
734 if not self or self[0].token_type != 'mailbox-list':
735 return []
736 return self[0].all_mailboxes
737
738
739class Group(TokenList):
740
741 token_type = "group"
742
743 @property
744 def mailboxes(self):
745 if self[2].token_type != 'group-list':
746 return []
747 return self[2].mailboxes
748
749 @property
750 def all_mailboxes(self):
751 if self[2].token_type != 'group-list':
752 return []
753 return self[2].all_mailboxes
754
755 @property
756 def display_name(self):
757 return self[0].display_name
758
759
760class NameAddr(TokenList):
761
762 token_type = 'name-addr'
763
764 @property
765 def display_name(self):
766 if len(self) == 1:
767 return None
768 return self[0].display_name
769
770 @property
771 def local_part(self):
772 return self[-1].local_part
773
774 @property
775 def domain(self):
776 return self[-1].domain
777
778 @property
779 def route(self):
780 return self[-1].route
781
782 @property
783 def addr_spec(self):
784 return self[-1].addr_spec
785
786
787class AngleAddr(TokenList):
788
789 token_type = 'angle-addr'
790
791 @property
792 def local_part(self):
793 for x in self:
794 if x.token_type == 'addr-spec':
795 return x.local_part
796
797 @property
798 def domain(self):
799 for x in self:
800 if x.token_type == 'addr-spec':
801 return x.domain
802
803 @property
804 def route(self):
805 for x in self:
806 if x.token_type == 'obs-route':
807 return x.domains
808
809 @property
810 def addr_spec(self):
811 for x in self:
812 if x.token_type == 'addr-spec':
813 return x.addr_spec
        else:
            return '<>'


class ObsRoute(TokenList):
819
820 token_type = 'obs-route'
821
822 @property
823 def domains(self):
824 return [x.domain for x in self if x.token_type == 'domain']
825
826
827class Mailbox(TokenList):
828
829 token_type = 'mailbox'
830
831 @property
832 def display_name(self):
833 if self[0].token_type == 'name-addr':
834 return self[0].display_name
835
836 @property
837 def local_part(self):
838 return self[0].local_part
839
840 @property
841 def domain(self):
842 return self[0].domain
843
844 @property
845 def route(self):
846 if self[0].token_type == 'name-addr':
847 return self[0].route
848
849 @property
850 def addr_spec(self):
851 return self[0].addr_spec
852
853
854class InvalidMailbox(TokenList):
855
856 token_type = 'invalid-mailbox'
857
858 @property
859 def display_name(self):
860 return None
861
862 local_part = domain = route = addr_spec = display_name
863
864
865class Domain(TokenList):
866
867 token_type = 'domain'
868
869 @property
870 def domain(self):
871 return ''.join(super().value.split())
872
873
874class DotAtom(TokenList):
875
876 token_type = 'dot-atom'
877
878
879class DotAtomText(TokenList):
880
881 token_type = 'dot-atom-text'
882
883
884class AddrSpec(TokenList):
885
886 token_type = 'addr-spec'
887
888 @property
889 def local_part(self):
890 return self[0].local_part
891
892 @property
893 def domain(self):
894 if len(self) < 3:
895 return None
896 return self[-1].domain
897
898 @property
899 def value(self):
900 if len(self) < 3:
901 return self[0].value
902 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
903
904 @property
905 def addr_spec(self):
906 nameset = set(self.local_part)
907 if len(nameset) > len(nameset-DOT_ATOM_ENDS):
908 lp = quote_string(self.local_part)
909 else:
910 lp = self.local_part
911 if self.domain is not None:
912 return lp + '@' + self.domain
913 return lp
914
915
916class ObsLocalPart(TokenList):
917
918 token_type = 'obs-local-part'
919
920
921class DisplayName(Phrase):
922
923 token_type = 'display-name'
924
925 @property
926 def display_name(self):
927 res = TokenList(self)
928 if res[0].token_type == 'cfws':
929 res.pop(0)
930 else:
931 if res[0][0].token_type == 'cfws':
932 res[0] = TokenList(res[0][1:])
933 if res[-1].token_type == 'cfws':
934 res.pop()
935 else:
936 if res[-1][-1].token_type == 'cfws':
937 res[-1] = TokenList(res[-1][:-1])
938 return res.value
939
940 @property
941 def value(self):
942 quote = False
943 if self.defects:
944 quote = True
945 else:
946 for x in self:
947 if x.token_type == 'quoted-string':
948 quote = True
949 if quote:
950 pre = post = ''
951 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
952 pre = ' '
953 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
954 post = ' '
955 return pre+quote_string(self.display_name)+post
956 else:
957 return super().value
958
959
960class LocalPart(TokenList):
961
962 token_type = 'local-part'
963
964 @property
965 def value(self):
966 if self[0].token_type == "quoted-string":
967 return self[0].quoted_value
968 else:
969 return self[0].value
970
971 @property
972 def local_part(self):
973 # Strip whitespace from front, back, and around dots.
974 res = [DOT]
975 last = DOT
976 last_is_tl = False
977 for tok in self[0] + [DOT]:
978 if tok.token_type == 'cfws':
979 continue
980 if (last_is_tl and tok.token_type == 'dot' and
981 last[-1].token_type == 'cfws'):
982 res[-1] = TokenList(last[:-1])
983 is_tl = isinstance(tok, TokenList)
984 if (is_tl and last.token_type == 'dot' and
985 tok[0].token_type == 'cfws'):
986 res.append(TokenList(tok[1:]))
987 else:
988 res.append(tok)
989 last = res[-1]
990 last_is_tl = is_tl
991 res = TokenList(res[1:-1])
992 return res.value
993
994
995class DomainLiteral(TokenList):
996
997 token_type = 'domain-literal'
998
999 @property
1000 def domain(self):
1001 return ''.join(super().value.split())
1002
1003 @property
1004 def ip(self):
1005 for x in self:
1006 if x.token_type == 'ptext':
1007 return x.value
1008
1009
class MIMEVersion(TokenList):
1011
1012 token_type = 'mime-version'
1013 major = None
1014 minor = None
1015
1016
1017class Parameter(TokenList):
1018
1019 token_type = 'parameter'
1020 sectioned = False
1021 extended = False
1022 charset = 'us-ascii'
1023
1024 @property
1025 def section_number(self):
1026 # Because the first token, the attribute (name) eats CFWS, the second
1027 # token is always the section if there is one.
1028 return self[1].number if self.sectioned else 0
1029
1030 @property
1031 def param_value(self):
1032 # This is part of the "handle quoted extended parameters" hack.
1033 for token in self:
1034 if token.token_type == 'value':
1035 return token.stripped_value
1036 if token.token_type == 'quoted-string':
1037 for token in token:
1038 if token.token_type == 'bare-quoted-string':
1039 for token in token:
1040 if token.token_type == 'value':
1041 return token.stripped_value
1042 return ''
1043
1044
1045class InvalidParameter(Parameter):
1046
1047 token_type = 'invalid-parameter'
1048
1049
1050class Attribute(TokenList):
1051
1052 token_type = 'attribute'
1053
1054 @property
1055 def stripped_value(self):
1056 for token in self:
1057 if token.token_type.endswith('attrtext'):
1058 return token.value
1059
1060class Section(TokenList):
1061
1062 token_type = 'section'
1063 number = None
1064
1065
1066class Value(TokenList):
1067
1068 token_type = 'value'
1069
1070 @property
1071 def stripped_value(self):
1072 token = self[0]
1073 if token.token_type == 'cfws':
1074 token = self[1]
1075 if token.token_type.endswith(
1076 ('quoted-string', 'attribute', 'extended-attribute')):
1077 return token.stripped_value
1078 return self.value
1079
1080
1081class MimeParameters(TokenList):
1082
1083 token_type = 'mime-parameters'
1084
1085 @property
1086 def params(self):
1087 # The RFC specifically states that the ordering of parameters is not
1088 # guaranteed and may be reordered by the transport layer. So we have
1089 # to assume the RFC 2231 pieces can come in any order. However, we
1090 # output them in the order that we first see a given name, which gives
1091 # us a stable __str__.
1092 params = OrderedDict()
1093 for token in self:
1094 if not token.token_type.endswith('parameter'):
1095 continue
1096 if token[0].token_type != 'attribute':
1097 continue
1098 name = token[0].value.strip()
1099 if name not in params:
1100 params[name] = []
1101 params[name].append((token.section_number, token))
1102 for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
1104 first_param = parts[0][1]
1105 charset = first_param.charset
1106 # Our arbitrary error recovery is to ignore duplicate parameters,
1107 # to use appearance order if there are duplicate rfc 2231 parts,
1108 # and to ignore gaps. This mimics the error recovery of get_param.
1109 if not first_param.extended and len(parts) > 1:
1110 if parts[1][0] == 0:
1111 parts[1][1].defects.append(errors.InvalidHeaderDefect(
1112 'duplicate parameter name; duplicate(s) ignored'))
1113 parts = parts[:1]
1114 # Else assume the *0* was missing...note that this is different
1115 # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
1121 # duplicate extended parameter and ignore the second one
1122 # seen. But we're not doing that. The old code didn't.
1123 if not param.extended:
1124 param.defects.append(errors.InvalidHeaderDefect(
1125 'duplicate parameter name; duplicate ignored'))
1126 continue
1127 else:
1128 param.defects.append(errors.InvalidHeaderDefect(
1129 "inconsistent RFC2231 parameter numbering"))
1130 i += 1
                value = param.param_value
1132 if param.extended:
1133 try:
1134 value = urllib.parse.unquote_to_bytes(value)
1135 except UnicodeEncodeError:
1136 # source had surrogate escaped bytes. What we do now
1137 # is a bit of an open question. I'm not sure this is
1138 # the best choice, but it is what the old algorithm did
1139 value = urllib.parse.unquote(value, encoding='latin-1')
1140 else:
1141 try:
1142 value = value.decode(charset, 'surrogateescape')
1143 except LookupError:
1144 # XXX: there should really be a custom defect for
1145 # unknown character set to make it easy to find,
1146 # because otherwise unknown charset is a silent
1147 # failure.
1148 value = value.decode('us-ascii', 'surrogateescape')
1149 if utils._has_surrogates(value):
1150 param.defects.append(errors.UndecodableBytesDefect())
1151 value_parts.append(value)
1152 value = ''.join(value_parts)
1153 yield name, value
1154
1155 def __str__(self):
1156 params = []
1157 for name, value in self.params:
1158 if value:
1159 params.append('{}={}'.format(name, quote_string(value)))
1160 else:
1161 params.append(name)
1162 params = '; '.join(params)
1163 return ' ' + params if params else ''
1164
1165
1166class ParameterizedHeaderValue(TokenList):
1167
1168 @property
1169 def params(self):
1170 for token in reversed(self):
1171 if token.token_type == 'mime-parameters':
1172 return token.params
1173 return {}
1174
1175 @property
1176 def parts(self):
1177 if self and self[-1].token_type == 'mime-parameters':
1178 # We don't want to start a new line if all of the params don't fit
1179 # after the value, so unwrap the parameter list.
1180 return TokenList(self[:-1] + self[-1])
1181 return TokenList(self).parts
1182
1183
1184class ContentType(ParameterizedHeaderValue):
1185
1186 token_type = 'content-type'
1187 maintype = 'text'
1188 subtype = 'plain'
1189
1190
1191class ContentDisposition(ParameterizedHeaderValue):
1192
1193 token_type = 'content-disposition'
1194 content_disposition = None
1195
1196
1197class ContentTransferEncoding(TokenList):
1198
1199 token_type = 'content-transfer-encoding'
1200 cte = '7bit'
1201
1202
class HeaderLabel(TokenList):
1204
1205 token_type = 'header-label'
1206
1207
1208class Header(TokenList):
1209
1210 token_type = 'header'
1211
1212 def _fold(self, folded):
1213 folded.append(str(self.pop(0)))
1214 folded.lastlen = len(folded.current[0])
1215 # The first line of the header is different from all others: we don't
1216 # want to start a new object on a new line if it has any fold points in
1217 # it that would allow part of it to be on the first header line.
1218 # Further, if the first fold point would fit on the new line, we want
1219 # to do that, but if it doesn't we want to put it on the first line.
1220 # Folded supports this via the stickyspace attribute. If this
1221 # attribute is not None, it does the special handling.
1222 folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
1223 rest = self.pop(0)
1224 if self:
1225 raise ValueError("Malformed Header token list")
1226 rest._fold(folded)
1227
1228
1229#
1230# Terminal classes and instances
1231#
1232
1233class Terminal(str):
1234
1235 def __new__(cls, value, token_type):
1236 self = super().__new__(cls, value)
1237 self.token_type = token_type
1238 self.defects = []
1239 return self
1240
1241 def __repr__(self):
1242 return "{}({})".format(self.__class__.__name__, super().__repr__())
1243
1244 @property
1245 def all_defects(self):
1246 return list(self.defects)
1247
1248 def _pp(self, indent=''):
1249 return ["{}{}/{}({}){}".format(
1250 indent,
1251 self.__class__.__name__,
1252 self.token_type,
1253 super().__repr__(),
1254 '' if not self.defects else ' {}'.format(self.defects),
1255 )]
1256
1257 def cte_encode(self, charset, policy):
1258 value = str(self)
1259 try:
1260 value.encode('us-ascii')
1261 return value
1262 except UnicodeEncodeError:
1263 return _ew.encode(value, charset)
1264
1265 def pop_trailing_ws(self):
1266 # This terminates the recursion.
1267 return None
1268
1269 def pop_leading_fws(self):
1270 # This terminates the recursion.
1271 return None
1272
1273 @property
1274 def comments(self):
1275 return []
1276
1277 def has_leading_comment(self):
1278 return False
1279
1280 def __getnewargs__(self):
1281 return(str(self), self.token_type)
1282
1283
1284class WhiteSpaceTerminal(Terminal):
1285
1286 @property
1287 def value(self):
1288 return ' '
1289
1290 def startswith_fws(self):
1291 return True
1292
1293 has_fws = True
1294
1295
1296class ValueTerminal(Terminal):
1297
1298 @property
1299 def value(self):
1300 return self
1301
1302 def startswith_fws(self):
1303 return False
1304
1305 has_fws = False
1306
1307 def as_encoded_word(self, charset):
1308 return _ew.encode(str(self), charset)
1309
1310
1311class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
1312
1313 @property
1314 def value(self):
1315 return ''
1316
1317 @property
1318 def encoded(self):
1319 return self[:]
1320
1321 def __str__(self):
1322 return ''
1323
1324 has_fws = True
1325
1326
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')

#
# Parser
#

# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser.  Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input.  Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.

_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
        '\\','\\\\').replace(']',r'\]'))).match
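# Added illustrative sketch of the get_XXX contract described above (not part
# of the original source); exact token nesting and reprs are approximate:
#
#   >>> atom, rest = get_atom(' (a comment) foobar  baz')
#   >>> str(atom)                  # the consumed substring, canonical form
#   ' (a comment) foobar  '
#   >>> atom.value, atom.comments  # semantic value, extracted comment
#   (' foobar ', ['a comment'])
#   >>> rest                       # unparsed remainder returned to the caller
#   'baz'
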
1367def _validate_xtext(xtext):
1368 """If input token contains ASCII non-printables, register a defect."""
1369
1370 non_printables = _non_printable_finder(xtext)
1371 if non_printables:
1372 xtext.defects.append(errors.NonPrintableDefect(non_printables))
1373 if utils._has_surrogates(xtext):
1374 xtext.defects.append(errors.UndecodableBytesDefect(
1375 "Non-ASCII characters found in header token"))
1376
1377def _get_ptext_to_endchars(value, endchars):
1378 """Scan printables/quoted-pairs until endchars and return unquoted ptext.
1379
1380 This function turns a run of qcontent, ccontent-without-comments, or
1381 dtext-with-quoted-printables into a single string by unquoting any
1382 quoted printables. It returns the string, the remaining value, and
1383 a flag that is True iff there were any quoted printables decoded.
1384
1385 """
1386 fragment, *remainder = _wsp_splitter(value, 1)
1387 vchars = []
1388 escape = False
1389 had_qp = False
1390 for pos in range(len(fragment)):
1391 if fragment[pos] == '\\':
1392 if escape:
1393 escape = False
1394 had_qp = True
1395 else:
1396 escape = True
1397 continue
1398 if escape:
1399 escape = False
1400 elif fragment[pos] in endchars:
1401 break
1402 vchars.append(fragment[pos])
1403 else:
1404 pos = pos + 1
1405 return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
1406
R David Murray0b6f6c82012-05-25 18:42:14 -04001407def get_fws(value):
1408 """FWS = 1*WSP
1409
1410 This isn't the RFC definition. We're using fws to represent tokens where
1411 folding can be done, but when we are parsing the *un*folding has already
1412 been done so we don't need to watch out for CRLF.
1413
1414 """
1415 newvalue = value.lstrip()
1416 fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
1417 return fws, newvalue
1418
1419def get_encoded_word(value):
1420 """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
1421
1422 """
1423 ew = EncodedWord()
1424 if not value.startswith('=?'):
1425 raise errors.HeaderParseError(
1426 "expected encoded word but found {}".format(value))
1427 tok, *remainder = value[2:].split('?=', 1)
1428 if tok == value[2:]:
1429 raise errors.HeaderParseError(
1430 "expected encoded word but found {}".format(value))
1431 remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
1435 tok = tok + '?=' + rest
1436 if len(tok.split()) > 1:
1437 ew.defects.append(errors.InvalidHeaderDefect(
1438 "whitespace inside encoded word"))
1439 ew.cte = value
1440 value = ''.join(remainder)
1441 try:
1442 text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
1443 except ValueError:
1444 raise errors.HeaderParseError(
1445 "encoded word format invalid: '{}'".format(ew.cte))
1446 ew.charset = charset
1447 ew.lang = lang
1448 ew.defects.extend(defects)
1449 while text:
1450 if text[0] in WSP:
1451 token, text = get_fws(text)
1452 ew.append(token)
1453 continue
1454 chars, *remainder = _wsp_splitter(text, 1)
1455 vtext = ValueTerminal(chars, 'vtext')
1456 _validate_xtext(vtext)
1457 ew.append(vtext)
1458 text = ''.join(remainder)
1459 return ew, value
1460
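# Added example for get_encoded_word above (not part of the original source);
# output is shown approximately:
#
#   >>> ew, rest = get_encoded_word('=?utf-8?q?caf=C3=A9?= tail')
#   >>> ew.charset, str(ew), rest
#   ('utf-8', 'café', ' tail')
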
1461def get_unstructured(value):
1462 """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
1463 obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
1464 obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
1465
1466 obs-NO-WS-CTL is control characters except WSP/CR/LF.
1467
1468 So, basically, we have printable runs, plus control characters or nulls in
1469 the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
1470 obsolete syntax in its specification, but requires whitespace on either
1471 side of the encoded words, I can see no reason to need to separate the
1472 non-printable-non-whitespace from the printable runs if they occur, so we
1473 parse this into xtext tokens separated by WSP tokens.
1474
1475 Because an 'unstructured' value must by definition constitute the entire
1476 value, this 'get' routine does not return a remaining value, only the
1477 parsed TokenList.
1478
1479 """
1480 # XXX: but what about bare CR and LF? They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
1485 while value:
1486 if value[0] in WSP:
1487 token, value = get_fws(value)
1488 unstructured.append(token)
1489 continue
1490 if value.startswith('=?'):
1491 try:
1492 token, value = get_encoded_word(value)
1493 except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
1497 else:
1498 have_ws = True
1499 if len(unstructured) > 0:
1500 if unstructured[-1].token_type != 'fws':
1501 unstructured.defects.append(errors.InvalidHeaderDefect(
1502 "missing whitespace before encoded word"))
1503 have_ws = False
1504 if have_ws and len(unstructured) > 1:
1505 if unstructured[-2].token_type == 'encoded-word':
1506 unstructured[-1] = EWWhiteSpaceTerminal(
1507 unstructured[-1], 'fws')
1508 unstructured.append(token)
1509 continue
1510 tok, *remainder = _wsp_splitter(value, 1)
1511 vtext = ValueTerminal(tok, 'vtext')
1512 _validate_xtext(vtext)
1513 unstructured.append(vtext)
1514 value = ''.join(remainder)
1515 return unstructured
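
# Added example for get_unstructured above (not part of the original source);
# the decoded value is the point, exact token structure is abbreviated:
#
#   >>> get_unstructured('Hello =?utf-8?q?W=C3=B6rld?=').value
#   'Hello Wörld'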
1516
1517def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in comment
    and unquoting quoted-pairs here.  We allow anything except the '()'
    characters, but if we find any ASCII other than the RFC defined printable
    ASCII, a NonPrintableDefect is added to the token's defects list.  Since
    quoted pairs are converted to their unquoted values, what is returned is
    a 'ptext' token.  In this case it is a WhiteSpaceTerminal, so its value
    is ' '.
1527
1528 """
1529 ptext, value, _ = _get_ptext_to_endchars(value, '()')
1530 ptext = WhiteSpaceTerminal(ptext, 'ptext')
1531 _validate_xtext(ptext)
1532 return ptext, value
1533
1534def get_qcontent(value):
1535 """qcontent = qtext / quoted-pair
1536
1537 We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to their
1540 unquoted values, so what is returned is a 'ptext' token. In this case it
1541 is a ValueTerminal.
1542
1543 """
1544 ptext, value, _ = _get_ptext_to_endchars(value, '"')
1545 ptext = ValueTerminal(ptext, 'ptext')
1546 _validate_xtext(ptext)
1547 return ptext, value
1548
1549def get_atext(value):
1550 """atext = <matches _atext_matcher>
1551
1552 We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
1553 the token's defects list if we find non-atext characters.
1554 """
1555 m = _non_atom_end_matcher(value)
1556 if not m:
1557 raise errors.HeaderParseError(
1558 "expected atext but found '{}'".format(value))
1559 atext = m.group()
1560 value = value[len(atext):]
1561 atext = ValueTerminal(atext, 'atext')
1562 _validate_xtext(atext)
1563 return atext, value
1564
1565def get_bare_quoted_string(value):
1566 """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
1567
1568 A quoted-string without the leading or trailing white space. Its
1569 value is the text between the quote marks, with whitespace
1570 preserved and quoted pairs decoded.
1571 """
1572 if value[0] != '"':
1573 raise errors.HeaderParseError(
1574 "expected '\"' but found '{}'".format(value))
1575 bare_quoted_string = BareQuotedString()
1576 value = value[1:]
1577 while value and value[0] != '"':
1578 if value[0] in WSP:
1579 token, value = get_fws(value)
        elif value[:2] == '=?':
1581 try:
1582 token, value = get_encoded_word(value)
1583 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1584 "encoded word inside quoted string"))
1585 except errors.HeaderParseError:
1586 token, value = get_qcontent(value)
        else:
1588 token, value = get_qcontent(value)
1589 bare_quoted_string.append(token)
1590 if not value:
1591 bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
1592 "end of header inside quoted string"))
1593 return bare_quoted_string, value
1594 return bare_quoted_string, value[1:]
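
# Added example for get_bare_quoted_string above (not part of the original
# source); note that, unlike other whitespace tokens, internal whitespace is
# preserved in the value:
#
#   >>> bqs, rest = get_bare_quoted_string('"two  spaces" x')
#   >>> bqs.value, rest
#   ('two  spaces', ' x')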
1595
1596def get_comment(value):
1597 """comment = "(" *([FWS] ccontent) [FWS] ")"
1598 ccontent = ctext / quoted-pair / comment
1599
1600 We handle nested comments here, and quoted-pair in our qp-ctext routine.
1601 """
1602 if value and value[0] != '(':
1603 raise errors.HeaderParseError(
1604 "expected '(' but found '{}'".format(value))
1605 comment = Comment()
1606 value = value[1:]
1607 while value and value[0] != ")":
1608 if value[0] in WSP:
1609 token, value = get_fws(value)
1610 elif value[0] == '(':
1611 token, value = get_comment(value)
1612 else:
1613 token, value = get_qp_ctext(value)
1614 comment.append(token)
1615 if not value:
1616 comment.defects.append(errors.InvalidHeaderDefect(
1617 "end of header inside comment"))
1618 return comment, value
1619 return comment, value[1:]
1620
1621def get_cfws(value):
1622 """CFWS = (1*([FWS] comment) [FWS]) / FWS
1623
1624 """
1625 cfws = CFWSList()
1626 while value and value[0] in CFWS_LEADER:
1627 if value[0] in WSP:
1628 token, value = get_fws(value)
1629 else:
1630 token, value = get_comment(value)
1631 cfws.append(token)
1632 return cfws, value
1633
1634def get_quoted_string(value):
1635 """quoted-string = [CFWS] <bare-quoted-string> [CFWS]
1636
1637 'bare-quoted-string' is an intermediate class defined by this
1638 parser and not by the RFC grammar. It is the quoted string
1639 without any attached CFWS.
1640 """
1641 quoted_string = QuotedString()
1642 if value and value[0] in CFWS_LEADER:
1643 token, value = get_cfws(value)
1644 quoted_string.append(token)
1645 token, value = get_bare_quoted_string(value)
1646 quoted_string.append(token)
1647 if value and value[0] in CFWS_LEADER:
1648 token, value = get_cfws(value)
1649 quoted_string.append(token)
1650 return quoted_string, value
1651
1652def get_atom(value):
1653 """atom = [CFWS] 1*atext [CFWS]
1654
    An atom could be an rfc2047 encoded word.
    """
1657 atom = Atom()
1658 if value and value[0] in CFWS_LEADER:
1659 token, value = get_cfws(value)
1660 atom.append(token)
1661 if value and value[0] in ATOM_ENDS:
1662 raise errors.HeaderParseError(
1663 "expected atom but found '{}'".format(value))
    if value.startswith('=?'):
1665 try:
1666 token, value = get_encoded_word(value)
1667 except errors.HeaderParseError:
1668 # XXX: need to figure out how to register defects when
1669 # appropriate here.
1670 token, value = get_atext(value)
1671 else:
1672 token, value = get_atext(value)
    atom.append(token)
1674 if value and value[0] in CFWS_LEADER:
1675 token, value = get_cfws(value)
1676 atom.append(token)
1677 return atom, value
1678
1679def get_dot_atom_text(value):
1680 """ dot-text = 1*atext *("." 1*atext)
1681
1682 """
1683 dot_atom_text = DotAtomText()
1684 if not value or value[0] in ATOM_ENDS:
1685 raise errors.HeaderParseError("expected atom at a start of "
1686 "dot-atom-text but found '{}'".format(value))
1687 while value and value[0] not in ATOM_ENDS:
1688 token, value = get_atext(value)
1689 dot_atom_text.append(token)
1690 if value and value[0] == '.':
1691 dot_atom_text.append(DOT)
1692 value = value[1:]
1693 if dot_atom_text[-1] is DOT:
1694 raise errors.HeaderParseError("expected atom at end of dot-atom-text "
1695 "but found '{}'".format('.'+value))
1696 return dot_atom_text, value
1697
1698def get_dot_atom(value):
1699 """ dot-atom = [CFWS] dot-atom-text [CFWS]
1700
    Any place we can have a dot atom, we could instead have an rfc2047 encoded
    word.
    """
1704 dot_atom = DotAtom()
1705 if value[0] in CFWS_LEADER:
1706 token, value = get_cfws(value)
1707 dot_atom.append(token)
    if value.startswith('=?'):
1709 try:
1710 token, value = get_encoded_word(value)
1711 except errors.HeaderParseError:
1712 # XXX: need to figure out how to register defects when
1713 # appropriate here.
1714 token, value = get_dot_atom_text(value)
1715 else:
1716 token, value = get_dot_atom_text(value)
    dot_atom.append(token)
1718 if value and value[0] in CFWS_LEADER:
1719 token, value = get_cfws(value)
1720 dot_atom.append(token)
1721 return dot_atom, value
1722
1723def get_word(value):
1724 """word = atom / quoted-string
1725
1726 Either atom or quoted-string may start with CFWS. We have to peel off this
1727 CFWS first to determine which type of word to parse. Afterward we splice
1728 the leading CFWS, if any, into the parsed sub-token.
1729
1730 If neither an atom or a quoted-string is found before the next special, a
1731 HeaderParseError is raised.
1732
1733 The token returned is either an Atom or a QuotedString, as appropriate.
1734 This means the 'word' level of the formal grammar is not represented in the
1735 parse tree; this is because having that extra layer when manipulating the
1736 parse tree is more confusing than it is helpful.
1737
1738 """
1739 if value[0] in CFWS_LEADER:
1740 leader, value = get_cfws(value)
1741 else:
1742 leader = None
1743 if value[0]=='"':
1744 token, value = get_quoted_string(value)
1745 elif value[0] in SPECIALS:
1746 raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
1747 "but found '{}'".format(value))
1748 else:
1749 token, value = get_atom(value)
1750 if leader is not None:
1751 token[:0] = [leader]
1752 return token, value
1753
1754def get_phrase(value):
1755 """ phrase = 1*word / obs-phrase
1756 obs-phrase = word *(word / "." / CFWS)
1757
1758 This means a phrase can be a sequence of words, periods, and CFWS in any
1759 order as long as it starts with at least one word. If anything other than
1760 words is detected, an ObsoleteHeaderDefect is added to the token's defect
1761 list. We also accept a phrase that starts with CFWS followed by a dot;
1762 this is registered as an InvalidHeaderDefect, since it is not supported by
1763 even the obsolete grammar.
1764
1765 """
1766 phrase = Phrase()
1767 try:
1768 token, value = get_word(value)
1769 phrase.append(token)
1770 except errors.HeaderParseError:
1771 phrase.defects.append(errors.InvalidHeaderDefect(
1772 "phrase does not start with word"))
1773 while value and value[0] not in PHRASE_ENDS:
1774 if value[0]=='.':
1775 phrase.append(DOT)
1776 phrase.defects.append(errors.ObsoleteHeaderDefect(
1777 "period in 'phrase'"))
1778 value = value[1:]
1779 else:
1780 try:
1781 token, value = get_word(value)
1782 except errors.HeaderParseError:
1783 if value[0] in CFWS_LEADER:
1784 token, value = get_cfws(value)
1785 phrase.defects.append(errors.ObsoleteHeaderDefect(
1786 "comment found without atom"))
1787 else:
1788 raise
1789 phrase.append(token)
1790 return phrase, value
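
# Illustrative sketch only (not used by the parser): a phrase containing a
# period exercises the obs-phrase handling described above.  Expected results
# are a hedged reading of the code.
def _sketch_get_phrase():
    phrase, rest = get_phrase('Foo B. Bar <foo@example.com>')
    # The '.' should be accepted but register an ObsoleteHeaderDefect on
    # phrase.defects, and the scan should stop at '<', which is in
    # PHRASE_ENDS, leaving rest as '<foo@example.com>'.
    return phrase, rest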
1791
1792def get_local_part(value):
1793 """ local-part = dot-atom / quoted-string / obs-local-part
1794
1795 """
1796 local_part = LocalPart()
1797 leader = None
1798 if value[0] in CFWS_LEADER:
1799 leader, value = get_cfws(value)
1800 if not value:
1801 raise errors.HeaderParseError(
1802 "expected local-part but found '{}'".format(value))
1803 try:
1804 token, value = get_dot_atom(value)
1805 except errors.HeaderParseError:
1806 try:
1807 token, value = get_word(value)
1808 except errors.HeaderParseError:
1809 if value[0] != '\\' and value[0] in PHRASE_ENDS:
1810 raise
1811 token = TokenList()
1812 if leader is not None:
1813 token[:0] = [leader]
1814 local_part.append(token)
1815 if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1816 obs_local_part, value = get_obs_local_part(str(local_part) + value)
1817 if obs_local_part.token_type == 'invalid-obs-local-part':
1818 local_part.defects.append(errors.InvalidHeaderDefect(
1819 "local-part is not dot-atom, quoted-string, or obs-local-part"))
1820 else:
1821 local_part.defects.append(errors.ObsoleteHeaderDefect(
1822 "local-part is not a dot-atom (contains CFWS)"))
1823 local_part[0] = obs_local_part
1824 try:
1825 local_part.value.encode('ascii')
1826 except UnicodeEncodeError:
1827 local_part.defects.append(errors.NonASCIILocalPartDefect(
1828 "local-part contains non-ASCII characters)"))
1829 return local_part, value
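
# Illustrative sketch only (not used by the parser): the common case where the
# local part is a plain dot-atom.  Expected results are a hedged reading of
# the code above.
def _sketch_get_local_part():
    local_part, rest = get_local_part('john.q.public@example.com')
    # local_part should wrap a dot-atom with no defects; rest should start
    # with '@', which is where get_addr_spec takes over.
    return local_part, rest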
1830
1831def get_obs_local_part(value):
1832 """ obs-local-part = word *("." word)
1833 """
1834 obs_local_part = ObsLocalPart()
1835 last_non_ws_was_dot = False
1836 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
1837 if value[0] == '.':
1838 if last_non_ws_was_dot:
1839 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1840 "invalid repeated '.'"))
1841 obs_local_part.append(DOT)
1842 last_non_ws_was_dot = True
1843 value = value[1:]
1844 continue
1845 elif value[0]=='\\':
1846 obs_local_part.append(ValueTerminal(value[0],
1847 'misplaced-special'))
1848 value = value[1:]
1849 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1850 "'\\' character outside of quoted-string/ccontent"))
1851 last_non_ws_was_dot = False
1852 continue
1853 if obs_local_part and obs_local_part[-1].token_type != 'dot':
1854 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1855 "missing '.' between words"))
1856 try:
1857 token, value = get_word(value)
1858 last_non_ws_was_dot = False
1859 except errors.HeaderParseError:
1860 if value[0] not in CFWS_LEADER:
1861 raise
1862 token, value = get_cfws(value)
1863 obs_local_part.append(token)
1864 if (obs_local_part[0].token_type == 'dot' or
1865 obs_local_part[0].token_type=='cfws' and
1866 obs_local_part[1].token_type=='dot'):
1867 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1868 "Invalid leading '.' in local part"))
1869 if (obs_local_part[-1].token_type == 'dot' or
1870 obs_local_part[-1].token_type=='cfws' and
1871 obs_local_part[-2].token_type=='dot'):
1872 obs_local_part.defects.append(errors.InvalidHeaderDefect(
1873 "Invalid trailing '.' in local part"))
1874 if obs_local_part.defects:
1875 obs_local_part.token_type = 'invalid-obs-local-part'
1876 return obs_local_part, value
1877
1878def get_dtext(value):
1879    r""" dtext = <printable ascii except \ [ ]> / obs-dtext
1880        obs-dtext = obs-NO-WS-CTL / quoted-pair
1881
1882    We allow anything except the excluded characters, but if we find any
1883    ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
1884    added to the token's defects list.  Quoted pairs are converted to their
1885 unquoted values, so what is returned is a ptext token, in this case a
1886 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
1887 added to the returned token's defect list.
1888
1889 """
1890 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
1891 ptext = ValueTerminal(ptext, 'ptext')
1892 if had_qp:
1893 ptext.defects.append(errors.ObsoleteHeaderDefect(
1894 "quoted printable found in domain-literal"))
1895 _validate_xtext(ptext)
1896 return ptext, value
1897
1898def _check_for_early_dl_end(value, domain_literal):
1899 if value:
1900 return False
1901 domain_literal.append(errors.InvalidHeaderDefect(
1902 "end of input inside domain-literal"))
1903 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1904 return True
1905
1906def get_domain_literal(value):
1907 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
1908
1909 """
1910 domain_literal = DomainLiteral()
1911 if value[0] in CFWS_LEADER:
1912 token, value = get_cfws(value)
1913 domain_literal.append(token)
1914 if not value:
1915 raise errors.HeaderParseError("expected domain-literal")
1916 if value[0] != '[':
1917 raise errors.HeaderParseError("expected '[' at start of domain-literal "
1918 "but found '{}'".format(value))
1919 value = value[1:]
1920 if _check_for_early_dl_end(value, domain_literal):
1921 return domain_literal, value
1922 domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
1923 if value[0] in WSP:
1924 token, value = get_fws(value)
1925 domain_literal.append(token)
1926 token, value = get_dtext(value)
1927 domain_literal.append(token)
1928 if _check_for_early_dl_end(value, domain_literal):
1929 return domain_literal, value
1930 if value[0] in WSP:
1931 token, value = get_fws(value)
1932 domain_literal.append(token)
1933 if _check_for_early_dl_end(value, domain_literal):
1934 return domain_literal, value
1935 if value[0] != ']':
1936 raise errors.HeaderParseError("expected ']' at end of domain-literal "
1937 "but found '{}'".format(value))
1938 domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
1939 value = value[1:]
1940 if value and value[0] in CFWS_LEADER:
1941 token, value = get_cfws(value)
1942 domain_literal.append(token)
1943 return domain_literal, value
1944
1945def get_domain(value):
1946 """ domain = dot-atom / domain-literal / obs-domain
1947 obs-domain = atom *("." atom))
1948
1949 """
1950 domain = Domain()
1951 leader = None
1952 if value[0] in CFWS_LEADER:
1953 leader, value = get_cfws(value)
1954 if not value:
1955 raise errors.HeaderParseError(
1956 "expected domain but found '{}'".format(value))
1957 if value[0] == '[':
1958 token, value = get_domain_literal(value)
1959 if leader is not None:
1960 token[:0] = [leader]
1961 domain.append(token)
1962 return domain, value
1963 try:
1964 token, value = get_dot_atom(value)
1965 except errors.HeaderParseError:
1966 token, value = get_atom(value)
1967 if leader is not None:
1968 token[:0] = [leader]
1969 domain.append(token)
1970 if value and value[0] == '.':
1971 domain.defects.append(errors.ObsoleteHeaderDefect(
1972 "domain is not a dot-atom (contains CFWS)"))
1973 if domain[0].token_type == 'dot-atom':
1974 domain[:] = domain[0]
1975 while value and value[0] == '.':
1976 domain.append(DOT)
1977 token, value = get_atom(value[1:])
1978 domain.append(token)
1979 return domain, value
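
# Illustrative sketch only (not used by the parser): the two main branches of
# get_domain, dot-atom and domain-literal.  Expected results are a hedged
# reading of the code above.
def _sketch_get_domain():
    domain, rest = get_domain('example.com')
    # domain should wrap a dot-atom covering 'example.com', with rest ''.
    literal, rest2 = get_domain('[127.0.0.1]')
    # literal should wrap a DomainLiteral token covering '[127.0.0.1]'.
    return domain, literal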
1980
1981def get_addr_spec(value):
1982 """ addr-spec = local-part "@" domain
1983
1984 """
1985 addr_spec = AddrSpec()
1986 token, value = get_local_part(value)
1987 addr_spec.append(token)
1988 if not value or value[0] != '@':
1989 addr_spec.defects.append(errors.InvalidHeaderDefect(
1990 "add-spec local part with no domain"))
1991 return addr_spec, value
1992 addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
1993 token, value = get_domain(value[1:])
1994 addr_spec.append(token)
1995 return addr_spec, value
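
# Illustrative sketch only (not used by the parser): a complete addr-spec.
# Expected results are a hedged reading of the code above.
def _sketch_get_addr_spec():
    addr_spec, rest = get_addr_spec('dinsdale@python.org')
    # addr_spec should contain the local part, the '@' terminal, and the
    # domain, with rest ''.
    return addr_spec, rest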
1996
1997def get_obs_route(value):
1998 """ obs-route = obs-domain-list ":"
1999 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain])
2000
2001 Returns an obs-route token with the appropriate sub-tokens (that is,
2002 there is no obs-domain-list in the parse tree).
2003 """
2004 obs_route = ObsRoute()
2005 while value and (value[0]==',' or value[0] in CFWS_LEADER):
2006 if value[0] in CFWS_LEADER:
2007 token, value = get_cfws(value)
2008 obs_route.append(token)
2009 elif value[0] == ',':
2010 obs_route.append(ListSeparator)
2011 value = value[1:]
2012 if not value or value[0] != '@':
2013 raise errors.HeaderParseError(
2014 "expected obs-route domain but found '{}'".format(value))
2015 obs_route.append(RouteComponentMarker)
2016 token, value = get_domain(value[1:])
2017 obs_route.append(token)
2018 while value and value[0]==',':
2019 obs_route.append(ListSeparator)
2020 value = value[1:]
2021 if not value:
2022 break
2023 if value[0] in CFWS_LEADER:
2024 token, value = get_cfws(value)
2025 obs_route.append(token)
2026 if value[0] == '@':
2027 obs_route.append(RouteComponentMarker)
2028 token, value = get_domain(value[1:])
2029 obs_route.append(token)
2030 if not value:
2031 raise errors.HeaderParseError("end of header while parsing obs-route")
2032 if value[0] != ':':
2033 raise errors.HeaderParseError( "expected ':' marking end of "
2034 "obs-route but found '{}'".format(value))
2035 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
2036 return obs_route, value[1:]
2037
2038def get_angle_addr(value):
2039 """ angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
2040 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
2041
2042 """
2043 angle_addr = AngleAddr()
2044 if value[0] in CFWS_LEADER:
2045 token, value = get_cfws(value)
2046 angle_addr.append(token)
2047 if not value or value[0] != '<':
2048 raise errors.HeaderParseError(
2049 "expected angle-addr but found '{}'".format(value))
2050 angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
2051 value = value[1:]
2052    # Although it is not legal per RFC5322, SMTP uses '<>' in certain
2053 # circumstances.
2054 if value[0] == '>':
2055 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2056 angle_addr.defects.append(errors.InvalidHeaderDefect(
2057 "null addr-spec in angle-addr"))
2058 value = value[1:]
2059 return angle_addr, value
2060    try:
2061 token, value = get_addr_spec(value)
2062 except errors.HeaderParseError:
2063 try:
2064 token, value = get_obs_route(value)
2065 angle_addr.defects.append(errors.ObsoleteHeaderDefect(
2066 "obsolete route specification in angle-addr"))
2067 except errors.HeaderParseError:
2068 raise errors.HeaderParseError(
2069                "expected addr-spec or obs-route but found '{}'".format(value))
2070        angle_addr.append(token)
2071 token, value = get_addr_spec(value)
2072 angle_addr.append(token)
2073 if value and value[0] == '>':
2074 value = value[1:]
2075 else:
2076 angle_addr.defects.append(errors.InvalidHeaderDefect(
2077 "missing trailing '>' on angle-addr"))
2078 angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
2079 if value and value[0] in CFWS_LEADER:
2080 token, value = get_cfws(value)
2081 angle_addr.append(token)
2082 return angle_addr, value
2083
2084def get_display_name(value):
2085 """ display-name = phrase
2086
2087 Because this is simply a name-rule, we don't return a display-name
2088 token containing a phrase, but rather a display-name token with
2089 the content of the phrase.
2090
2091 """
2092 display_name = DisplayName()
2093 token, value = get_phrase(value)
2094 display_name.extend(token[:])
2095 display_name.defects = token.defects[:]
2096 return display_name, value
2097
2098
2099def get_name_addr(value):
2100 """ name-addr = [display-name] angle-addr
2101
2102 """
2103 name_addr = NameAddr()
2104 # Both the optional display name and the angle-addr can start with cfws.
2105 leader = None
2106 if value[0] in CFWS_LEADER:
2107 leader, value = get_cfws(value)
2108 if not value:
2109 raise errors.HeaderParseError(
2110 "expected name-addr but found '{}'".format(leader))
2111 if value[0] != '<':
2112 if value[0] in PHRASE_ENDS:
2113 raise errors.HeaderParseError(
2114 "expected name-addr but found '{}'".format(value))
2115 token, value = get_display_name(value)
2116 if not value:
2117 raise errors.HeaderParseError(
2118 "expected name-addr but found '{}'".format(token))
2119 if leader is not None:
2120 token[0][:0] = [leader]
2121 leader = None
2122 name_addr.append(token)
2123 token, value = get_angle_addr(value)
2124 if leader is not None:
2125 token[:0] = [leader]
2126 name_addr.append(token)
2127 return name_addr, value
2128
2129def get_mailbox(value):
2130 """ mailbox = name-addr / addr-spec
2131
2132 """
2133 # The only way to figure out if we are dealing with a name-addr or an
2134 # addr-spec is to try parsing each one.
2135 mailbox = Mailbox()
2136 try:
2137 token, value = get_name_addr(value)
2138 except errors.HeaderParseError:
2139 try:
2140 token, value = get_addr_spec(value)
2141 except errors.HeaderParseError:
2142 raise errors.HeaderParseError(
2143 "expected mailbox but found '{}'".format(value))
2144 if any(isinstance(x, errors.InvalidHeaderDefect)
2145 for x in token.all_defects):
2146 mailbox.token_type = 'invalid-mailbox'
2147 mailbox.append(token)
2148 return mailbox, value
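
# Illustrative sketch only (not used by the parser): get_mailbox tries the
# name-addr form first and falls back to a bare addr-spec.  Expected results
# are a hedged reading of the code above.
def _sketch_get_mailbox():
    named, rest = get_mailbox('Fred Bloggs <fred@example.com>')
    # named should wrap a NameAddr: a display-name phrase plus an angle-addr.
    bare, rest2 = get_mailbox('fred@example.com')
    # bare should wrap a plain AddrSpec, since there is no '<' in the input.
    return named, bare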
2149
2150def get_invalid_mailbox(value, endchars):
2151 """ Read everything up to one of the chars in endchars.
2152
2153 This is outside the formal grammar. The InvalidMailbox TokenList that is
2154 returned acts like a Mailbox, but the data attributes are None.
2155
2156 """
2157 invalid_mailbox = InvalidMailbox()
2158 while value and value[0] not in endchars:
2159 if value[0] in PHRASE_ENDS:
2160 invalid_mailbox.append(ValueTerminal(value[0],
2161 'misplaced-special'))
2162 value = value[1:]
2163 else:
2164 token, value = get_phrase(value)
2165 invalid_mailbox.append(token)
2166 return invalid_mailbox, value
2167
2168def get_mailbox_list(value):
2169 """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
2170 obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])
2171
2172 For this routine we go outside the formal grammar in order to improve error
2173 handling. We recognize the end of the mailbox list only at the end of the
2174 value or at a ';' (the group terminator). This is so that we can turn
2175 invalid mailboxes into InvalidMailbox tokens and continue parsing any
2176 remaining valid mailboxes. We also allow all mailbox entries to be null,
2177 and this condition is handled appropriately at a higher level.
2178
2179 """
2180 mailbox_list = MailboxList()
2181 while value and value[0] != ';':
2182 try:
2183 token, value = get_mailbox(value)
2184 mailbox_list.append(token)
2185 except errors.HeaderParseError:
2186 leader = None
2187 if value[0] in CFWS_LEADER:
2188 leader, value = get_cfws(value)
2189 if not value or value[0] in ',;':
2190 mailbox_list.append(leader)
2191 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2192 "empty element in mailbox-list"))
2193 else:
2194 token, value = get_invalid_mailbox(value, ',;')
2195 if leader is not None:
2196 token[:0] = [leader]
2197 mailbox_list.append(token)
2198 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2199 "invalid mailbox in mailbox-list"))
2200 elif value[0] == ',':
2201 mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
2202 "empty element in mailbox-list"))
2203 else:
2204 token, value = get_invalid_mailbox(value, ',;')
2205 if leader is not None:
2206 token[:0] = [leader]
2207 mailbox_list.append(token)
2208 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2209 "invalid mailbox in mailbox-list"))
2210 if value and value[0] not in ',;':
2211 # Crap after mailbox; treat it as an invalid mailbox.
2212 # The mailbox info will still be available.
2213 mailbox = mailbox_list[-1]
2214 mailbox.token_type = 'invalid-mailbox'
2215 token, value = get_invalid_mailbox(value, ',;')
2216 mailbox.extend(token)
2217 mailbox_list.defects.append(errors.InvalidHeaderDefect(
2218 "invalid mailbox in mailbox-list"))
2219 if value and value[0] == ',':
2220 mailbox_list.append(ListSeparator)
2221 value = value[1:]
2222 return mailbox_list, value
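
# Illustrative sketch only (not used by the parser): error recovery in
# get_mailbox_list.  Expected results are a hedged reading of the code above.
def _sketch_get_mailbox_list():
    mbox_list, rest = get_mailbox_list('ann@example.com, , bob@example.com;')
    # The empty element between the commas should register an
    # ObsoleteHeaderDefect, both addresses should still be parsed, and the
    # scan should stop at the group terminator, leaving ';' in rest.
    return mbox_list, rest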
2223
2224
2225def get_group_list(value):
2226 """ group-list = mailbox-list / CFWS / obs-group-list
2227 obs-group-list = 1*([CFWS] ",") [CFWS]
2228
2229 """
2230 group_list = GroupList()
2231 if not value:
2232 group_list.defects.append(errors.InvalidHeaderDefect(
2233 "end of header before group-list"))
2234 return group_list, value
2235 leader = None
2236 if value and value[0] in CFWS_LEADER:
2237 leader, value = get_cfws(value)
2238 if not value:
2239 # This should never happen in email parsing, since CFWS-only is a
2240 # legal alternative to group-list in a group, which is the only
2241 # place group-list appears.
2242 group_list.defects.append(errors.InvalidHeaderDefect(
2243 "end of header in group-list"))
2244 group_list.append(leader)
2245 return group_list, value
2246 if value[0] == ';':
2247 group_list.append(leader)
2248 return group_list, value
2249 token, value = get_mailbox_list(value)
2250 if len(token.all_mailboxes)==0:
2251 if leader is not None:
2252 group_list.append(leader)
2253 group_list.extend(token)
2254 group_list.defects.append(errors.ObsoleteHeaderDefect(
2255 "group-list with empty entries"))
2256 return group_list, value
2257 if leader is not None:
2258 token[:0] = [leader]
2259 group_list.append(token)
2260 return group_list, value
2261
2262def get_group(value):
2263 """ group = display-name ":" [group-list] ";" [CFWS]
2264
2265 """
2266 group = Group()
2267 token, value = get_display_name(value)
2268 if not value or value[0] != ':':
2269 raise errors.HeaderParseError("expected ':' at end of group "
2270 "display name but found '{}'".format(value))
2271 group.append(token)
2272 group.append(ValueTerminal(':', 'group-display-name-terminator'))
2273 value = value[1:]
2274 if value and value[0] == ';':
2275 group.append(ValueTerminal(';', 'group-terminator'))
2276 return group, value[1:]
2277 token, value = get_group_list(value)
2278 group.append(token)
2279 if not value:
2280 group.defects.append(errors.InvalidHeaderDefect(
2281 "end of header in group"))
2282    elif value[0] != ';':
2283 raise errors.HeaderParseError(
2284 "expected ';' at end of group but found {}".format(value))
2285 group.append(ValueTerminal(';', 'group-terminator'))
2286 value = value[1:]
2287 if value and value[0] in CFWS_LEADER:
2288 token, value = get_cfws(value)
2289 group.append(token)
2290 return group, value
2291
2292def get_address(value):
2293 """ address = mailbox / group
2294
2295 Note that counter-intuitively, an address can be either a single address or
2296 a list of addresses (a group). This is why the returned Address object has
2297 a 'mailboxes' attribute which treats a single address as a list of length
2298    one.  When you need to differentiate between the two cases, extract the single
2299 element, which is either a mailbox or a group token.
2300
2301 """
2302 # The formal grammar isn't very helpful when parsing an address. mailbox
2303 # and group, especially when allowing for obsolete forms, start off very
2304 # similarly. It is only when you reach one of @, <, or : that you know
2305 # what you've got. So, we try each one in turn, starting with the more
2306 # likely of the two. We could perhaps make this more efficient by looking
2307 # for a phrase and then branching based on the next character, but that
2308 # would be a premature optimization.
2309 address = Address()
2310 try:
2311 token, value = get_group(value)
2312 except errors.HeaderParseError:
2313 try:
2314 token, value = get_mailbox(value)
2315 except errors.HeaderParseError:
2316 raise errors.HeaderParseError(
2317 "expected address but found '{}'".format(value))
2318 address.append(token)
2319 return address, value
2320
2321def get_address_list(value):
2322 """ address_list = (address *("," address)) / obs-addr-list
2323 obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])
2324
2325 We depart from the formal grammar here by continuing to parse until the end
2326 of the input, assuming the input to be entirely composed of an
2327 address-list. This is always true in email parsing, and allows us
2328 to skip invalid addresses to parse additional valid ones.
2329
2330 """
2331 address_list = AddressList()
2332 while value:
2333 try:
2334 token, value = get_address(value)
2335 address_list.append(token)
2336 except errors.HeaderParseError as err:
2337 leader = None
2338 if value[0] in CFWS_LEADER:
2339 leader, value = get_cfws(value)
2340 if not value or value[0] == ',':
2341 address_list.append(leader)
2342 address_list.defects.append(errors.ObsoleteHeaderDefect(
2343 "address-list entry with no content"))
2344 else:
2345 token, value = get_invalid_mailbox(value, ',')
2346 if leader is not None:
2347 token[:0] = [leader]
2348 address_list.append(Address([token]))
2349 address_list.defects.append(errors.InvalidHeaderDefect(
2350 "invalid address in address-list"))
2351 elif value[0] == ',':
2352 address_list.defects.append(errors.ObsoleteHeaderDefect(
2353 "empty element in address-list"))
2354 else:
2355 token, value = get_invalid_mailbox(value, ',')
2356 if leader is not None:
2357 token[:0] = [leader]
2358 address_list.append(Address([token]))
2359 address_list.defects.append(errors.InvalidHeaderDefect(
2360 "invalid address in address-list"))
2361 if value and value[0] != ',':
2362 # Crap after address; treat it as an invalid mailbox.
2363 # The mailbox info will still be available.
2364 mailbox = address_list[-1][0]
2365 mailbox.token_type = 'invalid-mailbox'
2366 token, value = get_invalid_mailbox(value, ',')
2367 mailbox.extend(token)
2368 address_list.defects.append(errors.InvalidHeaderDefect(
2369 "invalid address in address-list"))
2370 if value: # Must be a , at this point.
2371 address_list.append(ValueTerminal(',', 'list-separator'))
2372 value = value[1:]
2373 return address_list, value
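
# Illustrative sketch only (not used by the parser): a header value mixing a
# mailbox and a group, as handled by get_address_list.  Expected results are
# a hedged reading of the code above.
def _sketch_get_address_list():
    address_list, rest = get_address_list(
        'Alice <alice@example.com>, undisclosed-recipients:;')
    # The first address should wrap a mailbox and the second a group with no
    # mailboxes; rest should be '' since the whole value is consumed.
    return address_list, rest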
2374
2375#
2376# XXX: As I begin to add additional header parsers, I'm realizing we probably
2377# have two level of parser routines: the get_XXX methods that get a token in
2378# the grammar, and parse_XXX methods that parse an entire field value. So
2379# get_address_list above should really be a parse_ method, as probably should
2380# be get_unstructured.
2381#
2382
2383def parse_mime_version(value):
2384 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS]
2385
2386 """
2387 # The [CFWS] is implicit in the RFC 2045 BNF.
2388 # XXX: This routine is a bit verbose, should factor out a get_int method.
2389 mime_version = MIMEVersion()
2390 if not value:
2391 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2392 "Missing MIME version number (eg: 1.0)"))
2393 return mime_version
2394 if value[0] in CFWS_LEADER:
2395 token, value = get_cfws(value)
2396 mime_version.append(token)
2397 if not value:
2398 mime_version.defects.append(errors.HeaderMissingRequiredValue(
2399 "Expected MIME version number but found only CFWS"))
2400 digits = ''
2401 while value and value[0] != '.' and value[0] not in CFWS_LEADER:
2402 digits += value[0]
2403 value = value[1:]
2404 if not digits.isdigit():
2405 mime_version.defects.append(errors.InvalidHeaderDefect(
2406 "Expected MIME major version number but found {!r}".format(digits)))
2407 mime_version.append(ValueTerminal(digits, 'xtext'))
2408 else:
2409 mime_version.major = int(digits)
2410 mime_version.append(ValueTerminal(digits, 'digits'))
2411 if value and value[0] in CFWS_LEADER:
2412 token, value = get_cfws(value)
2413 mime_version.append(token)
2414 if not value or value[0] != '.':
2415 if mime_version.major is not None:
2416 mime_version.defects.append(errors.InvalidHeaderDefect(
2417 "Incomplete MIME version; found only major number"))
2418 if value:
2419 mime_version.append(ValueTerminal(value, 'xtext'))
2420 return mime_version
2421 mime_version.append(ValueTerminal('.', 'version-separator'))
2422 value = value[1:]
2423 if value and value[0] in CFWS_LEADER:
2424 token, value = get_cfws(value)
2425 mime_version.append(token)
2426 if not value:
2427 if mime_version.major is not None:
2428 mime_version.defects.append(errors.InvalidHeaderDefect(
2429 "Incomplete MIME version; found only major number"))
2430 return mime_version
2431 digits = ''
2432 while value and value[0] not in CFWS_LEADER:
2433 digits += value[0]
2434 value = value[1:]
2435 if not digits.isdigit():
2436 mime_version.defects.append(errors.InvalidHeaderDefect(
2437 "Expected MIME minor version number but found {!r}".format(digits)))
2438 mime_version.append(ValueTerminal(digits, 'xtext'))
2439 else:
2440 mime_version.minor = int(digits)
2441 mime_version.append(ValueTerminal(digits, 'digits'))
2442 if value and value[0] in CFWS_LEADER:
2443 token, value = get_cfws(value)
2444 mime_version.append(token)
2445 if value:
2446 mime_version.defects.append(errors.InvalidHeaderDefect(
2447 "Excess non-CFWS text after MIME version"))
2448 mime_version.append(ValueTerminal(value, 'xtext'))
2449 return mime_version
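
# Illustrative sketch only (not used by the parser): a typical MIME-Version
# value.  Expected results are a hedged reading of the code above.
def _sketch_parse_mime_version():
    mime_version = parse_mime_version('1.0 (generated by example)')
    # mime_version.major should be 1 and mime_version.minor 0; the trailing
    # comment should be absorbed as CFWS rather than reported as a defect.
    return mime_version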
2450
2451def get_invalid_parameter(value):
2452 """ Read everything up to the next ';'.
2453
2454 This is outside the formal grammar. The InvalidParameter TokenList that is
2455 returned acts like a Parameter, but the data attributes are None.
2456
2457 """
2458 invalid_parameter = InvalidParameter()
2459 while value and value[0] != ';':
2460 if value[0] in PHRASE_ENDS:
2461 invalid_parameter.append(ValueTerminal(value[0],
2462 'misplaced-special'))
2463 value = value[1:]
2464 else:
2465 token, value = get_phrase(value)
2466 invalid_parameter.append(token)
2467 return invalid_parameter, value
2468
2469def get_ttext(value):
2470 """ttext = <matches _ttext_matcher>
2471
2472 We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
2473 defects list if we find non-ttext characters. We also register defects for
2474 *any* non-printables even though the RFC doesn't exclude all of them,
2475 because we follow the spirit of RFC 5322.
2476
2477 """
2478 m = _non_token_end_matcher(value)
2479 if not m:
2480 raise errors.HeaderParseError(
2481 "expected ttext but found '{}'".format(value))
2482 ttext = m.group()
2483 value = value[len(ttext):]
2484 ttext = ValueTerminal(ttext, 'ttext')
2485 _validate_xtext(ttext)
2486 return ttext, value
2487
2488def get_token(value):
2489 """token = [CFWS] 1*ttext [CFWS]
2490
2491 The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
2492 tspecials. We also exclude tabs even though the RFC doesn't.
2493
2494 The RFC implies the CFWS but is not explicit about it in the BNF.
2495
2496 """
2497 mtoken = Token()
2498 if value and value[0] in CFWS_LEADER:
2499 token, value = get_cfws(value)
2500 mtoken.append(token)
2501 if value and value[0] in TOKEN_ENDS:
2502 raise errors.HeaderParseError(
2503 "expected token but found '{}'".format(value))
2504 token, value = get_ttext(value)
2505 mtoken.append(token)
2506 if value and value[0] in CFWS_LEADER:
2507 token, value = get_cfws(value)
2508 mtoken.append(token)
2509 return mtoken, value
2510
2511def get_attrtext(value):
2512 """attrtext = 1*(any non-ATTRIBUTE_ENDS character)
2513
2514 We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
2515 token's defects list if we find non-attrtext characters. We also register
2516 defects for *any* non-printables even though the RFC doesn't exclude all of
2517 them, because we follow the spirit of RFC 5322.
2518
2519 """
2520 m = _non_attribute_end_matcher(value)
2521 if not m:
2522 raise errors.HeaderParseError(
2523 "expected attrtext but found {!r}".format(value))
2524 attrtext = m.group()
2525 value = value[len(attrtext):]
2526 attrtext = ValueTerminal(attrtext, 'attrtext')
2527 _validate_xtext(attrtext)
2528 return attrtext, value
2529
2530def get_attribute(value):
2531 """ [CFWS] 1*attrtext [CFWS]
2532
2533 This version of the BNF makes the CFWS explicit, and as usual we use a
2534 value terminal for the actual run of characters. The RFC equivalent of
2535 attrtext is the token characters, with the subtraction of '*', "'", and '%'.
2536 We include tab in the excluded set just as we do for token.
2537
2538 """
2539 attribute = Attribute()
2540 if value and value[0] in CFWS_LEADER:
2541 token, value = get_cfws(value)
2542 attribute.append(token)
2543 if value and value[0] in ATTRIBUTE_ENDS:
2544 raise errors.HeaderParseError(
2545 "expected token but found '{}'".format(value))
2546 token, value = get_attrtext(value)
2547 attribute.append(token)
2548 if value and value[0] in CFWS_LEADER:
2549 token, value = get_cfws(value)
2550 attribute.append(token)
2551 return attribute, value
2552
2553def get_extended_attrtext(value):
2554 """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
2555
2556 This is a special parsing routine so that we get a value that
2557 includes % escapes as a single string (which we decode as a single
2558 string later).
2559
2560 """
2561 m = _non_extended_attribute_end_matcher(value)
2562 if not m:
2563 raise errors.HeaderParseError(
2564 "expected extended attrtext but found {!r}".format(value))
2565 attrtext = m.group()
2566 value = value[len(attrtext):]
2567 attrtext = ValueTerminal(attrtext, 'extended-attrtext')
2568 _validate_xtext(attrtext)
2569 return attrtext, value
2570
2571def get_extended_attribute(value):
2572 """ [CFWS] 1*extended_attrtext [CFWS]
2573
2574 This is like the non-extended version except we allow % characters, so that
2575 we can pick up an encoded value as a single string.
2576
2577 """
2578 # XXX: should we have an ExtendedAttribute TokenList?
2579 attribute = Attribute()
2580 if value and value[0] in CFWS_LEADER:
2581 token, value = get_cfws(value)
2582 attribute.append(token)
2583 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
2584 raise errors.HeaderParseError(
2585 "expected token but found '{}'".format(value))
2586 token, value = get_extended_attrtext(value)
2587 attribute.append(token)
2588 if value and value[0] in CFWS_LEADER:
2589 token, value = get_cfws(value)
2590 attribute.append(token)
2591 return attribute, value
2592
2593def get_section(value):
2594 """ '*' digits
2595
2596 The formal BNF is more complicated because leading 0s are not allowed. We
2597 check for that and add a defect. We also assume no CFWS is allowed between
2598 the '*' and the digits, though the RFC is not crystal clear on that.
2599 The caller should already have dealt with leading CFWS.
2600
2601 """
2602 section = Section()
2603 if not value or value[0] != '*':
2604 raise errors.HeaderParseError("Expected section but found {}".format(
2605 value))
2606 section.append(ValueTerminal('*', 'section-marker'))
2607 value = value[1:]
2608 if not value or not value[0].isdigit():
2609 raise errors.HeaderParseError("Expected section number but "
2610 "found {}".format(value))
2611 digits = ''
2612 while value and value[0].isdigit():
2613 digits += value[0]
2614 value = value[1:]
2615 if digits[0] == '0' and digits != '0':
2616        section.defects.append(errors.InvalidHeaderDefect("section number "
2617            "has an invalid leading 0"))
2618 section.number = int(digits)
2619 section.append(ValueTerminal(digits, 'digits'))
2620 return section, value
2621
2622
2623def get_value(value):
2624 """ quoted-string / attribute
2625
2626 """
2627 v = Value()
2628 if not value:
2629 raise errors.HeaderParseError("Expected value but found end of string")
2630 leader = None
2631 if value[0] in CFWS_LEADER:
2632 leader, value = get_cfws(value)
2633 if not value:
2634 raise errors.HeaderParseError("Expected value but found "
2635 "only {}".format(leader))
2636 if value[0] == '"':
2637 token, value = get_quoted_string(value)
2638 else:
2639 token, value = get_extended_attribute(value)
2640 if leader is not None:
2641 token[:0] = [leader]
2642 v.append(token)
2643 return v, value
2644
2645def get_parameter(value):
2646 """ attribute [section] ["*"] [CFWS] "=" value
2647
2648 The CFWS is implied by the RFC but not made explicit in the BNF. This
2649 simplified form of the BNF from the RFC is made to conform with the RFC BNF
2650 through some extra checks. We do it this way because it makes both error
2651 recovery and working with the resulting parse tree easier.
2652 """
2653 # It is possible CFWS would also be implicitly allowed between the section
2654 # and the 'extended-attribute' marker (the '*') , but we've never seen that
2655 # in the wild and we will therefore ignore the possibility.
2656 param = Parameter()
2657 token, value = get_attribute(value)
2658 param.append(token)
2659 if not value or value[0] == ';':
2660 param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
2661 "name ({}) but no value".format(token)))
2662 return param, value
2663 if value[0] == '*':
2664 try:
2665 token, value = get_section(value)
2666 param.sectioned = True
2667 param.append(token)
2668 except errors.HeaderParseError:
2669 pass
2670 if not value:
2671 raise errors.HeaderParseError("Incomplete parameter")
2672 if value[0] == '*':
2673 param.append(ValueTerminal('*', 'extended-parameter-marker'))
2674 value = value[1:]
2675 param.extended = True
2676 if value[0] != '=':
2677 raise errors.HeaderParseError("Parameter not followed by '='")
2678 param.append(ValueTerminal('=', 'parameter-separator'))
2679 value = value[1:]
2680 leader = None
2681 if value and value[0] in CFWS_LEADER:
2682 token, value = get_cfws(value)
2683 param.append(token)
2684 remainder = None
2685 appendto = param
2686 if param.extended and value and value[0] == '"':
2687 # Now for some serious hackery to handle the common invalid case of
2688 # double quotes around an extended value. We also accept (with defect)
2689 # a value marked as encoded that isn't really.
2690 qstring, remainder = get_quoted_string(value)
2691 inner_value = qstring.stripped_value
2692 semi_valid = False
2693 if param.section_number == 0:
2694 if inner_value and inner_value[0] == "'":
2695 semi_valid = True
2696 else:
2697 token, rest = get_attrtext(inner_value)
2698 if rest and rest[0] == "'":
2699 semi_valid = True
2700 else:
2701 try:
2702 token, rest = get_extended_attrtext(inner_value)
2703                except errors.HeaderParseError:
2704 pass
2705 else:
2706 if not rest:
2707 semi_valid = True
2708 if semi_valid:
2709 param.defects.append(errors.InvalidHeaderDefect(
2710 "Quoted string value for extended parameter is invalid"))
2711 param.append(qstring)
2712 for t in qstring:
2713 if t.token_type == 'bare-quoted-string':
2714 t[:] = []
2715 appendto = t
2716 break
2717 value = inner_value
2718 else:
2719 remainder = None
2720 param.defects.append(errors.InvalidHeaderDefect(
2721 "Parameter marked as extended but appears to have a "
2722 "quoted string value that is non-encoded"))
2723 if value and value[0] == "'":
2724 token = None
2725 else:
2726 token, value = get_value(value)
2727 if not param.extended or param.section_number > 0:
2728 if not value or value[0] != "'":
2729 appendto.append(token)
2730 if remainder is not None:
2731 assert not value, value
2732 value = remainder
2733 return param, value
2734 param.defects.append(errors.InvalidHeaderDefect(
2735 "Apparent initial-extended-value but attribute "
2736 "was not marked as extended or was not initial section"))
2737 if not value:
2738 # Assume the charset/lang is missing and the token is the value.
2739 param.defects.append(errors.InvalidHeaderDefect(
2740 "Missing required charset/lang delimiters"))
2741 appendto.append(token)
2742 if remainder is None:
2743 return param, value
2744 else:
2745 if token is not None:
2746 for t in token:
2747 if t.token_type == 'extended-attrtext':
2748 break
2749            t.token_type = 'attrtext'
2750 appendto.append(t)
2751 param.charset = t.value
2752 if value[0] != "'":
2753 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2754 "delimiter, but found {!r}".format(value))
2755 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2756 value = value[1:]
2757 if value and value[0] != "'":
2758 token, value = get_attrtext(value)
2759 appendto.append(token)
2760 param.lang = token.value
2761 if not value or value[0] != "'":
2762 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
2763 "delimiter, but found {}".format(value))
2764 appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
2765 value = value[1:]
2766 if remainder is not None:
2767 # Treat the rest of value as bare quoted string content.
2768 v = Value()
2769 while value:
2770 if value[0] in WSP:
2771 token, value = get_fws(value)
2772 else:
2773 token, value = get_qcontent(value)
2774 v.append(token)
2775 token = v
2776 else:
2777 token, value = get_value(value)
2778 appendto.append(token)
2779 if remainder is not None:
2780 assert not value, value
2781 value = remainder
2782 return param, value
2783
2784def parse_mime_parameters(value):
2785 """ parameter *( ";" parameter )
2786
2787 That BNF is meant to indicate this routine should only be called after
2788 finding and handling the leading ';'. There is no corresponding rule in
2789 the formal RFC grammar, but it is more convenient for us for the set of
2790 parameters to be treated as its own TokenList.
2791
2792    This is a 'parse' routine because it consumes the remaining value, but it
2793 would never be called to parse a full header. Instead it is called to
2794 parse everything after the non-parameter value of a specific MIME header.
2795
2796 """
2797 mime_parameters = MimeParameters()
2798 while value:
2799 try:
2800 token, value = get_parameter(value)
2801 mime_parameters.append(token)
2802 except errors.HeaderParseError as err:
2803 leader = None
2804 if value[0] in CFWS_LEADER:
2805 leader, value = get_cfws(value)
2806 if not value:
2807 mime_parameters.append(leader)
2808 return mime_parameters
2809 if value[0] == ';':
2810 if leader is not None:
2811 mime_parameters.append(leader)
2812 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2813 "parameter entry with no content"))
2814 else:
2815 token, value = get_invalid_parameter(value)
2816 if leader:
2817 token[:0] = [leader]
2818 mime_parameters.append(token)
2819 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2820 "invalid parameter {!r}".format(token)))
2821 if value and value[0] != ';':
2822 # Junk after the otherwise valid parameter. Mark it as
2823 # invalid, but it will have a value.
2824 param = mime_parameters[-1]
2825 param.token_type = 'invalid-parameter'
2826 token, value = get_invalid_parameter(value)
2827 param.extend(token)
2828 mime_parameters.defects.append(errors.InvalidHeaderDefect(
2829 "parameter with invalid trailing text {!r}".format(token)))
2830 if value:
2831 # Must be a ';' at this point.
2832 mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
2833 value = value[1:]
2834 return mime_parameters
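
# Illustrative sketch only (not used by the parser): parse_mime_parameters is
# called with the text that follows the ';' after the main header value.
# Expected results are a hedged reading of the code above.
def _sketch_parse_mime_parameters():
    params = parse_mime_parameters(' charset="utf-8"; format=flowed')
    # params should be a MimeParameters token containing two Parameter
    # tokens, one for charset and one for format.
    return params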
2835
2836def _find_mime_parameters(tokenlist, value):
2837 """Do our best to find the parameters in an invalid MIME header
2838
2839 """
2840 while value and value[0] != ';':
2841 if value[0] in PHRASE_ENDS:
2842 tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
2843 value = value[1:]
2844 else:
2845 token, value = get_phrase(value)
2846 tokenlist.append(token)
2847 if not value:
2848 return
2849 tokenlist.append(ValueTerminal(';', 'parameter-separator'))
2850 tokenlist.append(parse_mime_parameters(value[1:]))
2851
2852def parse_content_type_header(value):
2853 """ maintype "/" subtype *( ";" parameter )
2854
2855    The maintype and subtype are tokens.  Theoretically they could
2856 be checked against the official IANA list + x-token, but we
2857 don't do that.
2858 """
2859 ctype = ContentType()
2860 recover = False
2861 if not value:
2862 ctype.defects.append(errors.HeaderMissingRequiredValue(
2863 "Missing content type specification"))
2864 return ctype
2865 try:
2866 token, value = get_token(value)
2867 except errors.HeaderParseError:
2868 ctype.defects.append(errors.InvalidHeaderDefect(
2869 "Expected content maintype but found {!r}".format(value)))
2870 _find_mime_parameters(ctype, value)
2871 return ctype
2872 ctype.append(token)
2873    # XXX: If we really want to follow the formal grammar we should make
2874    # maintype and subtype specialized TokenLists here.  Probably not worth it.
2875 if not value or value[0] != '/':
2876 ctype.defects.append(errors.InvalidHeaderDefect(
2877 "Invalid content type"))
2878 if value:
2879 _find_mime_parameters(ctype, value)
2880 return ctype
2881 ctype.maintype = token.value.strip().lower()
2882 ctype.append(ValueTerminal('/', 'content-type-separator'))
2883 value = value[1:]
2884 try:
2885 token, value = get_token(value)
2886 except errors.HeaderParseError:
2887 ctype.defects.append(errors.InvalidHeaderDefect(
2888 "Expected content subtype but found {!r}".format(value)))
2889 _find_mime_parameters(ctype, value)
2890 return ctype
2891 ctype.append(token)
2892 ctype.subtype = token.value.strip().lower()
2893 if not value:
2894 return ctype
2895 if value[0] != ';':
2896 ctype.defects.append(errors.InvalidHeaderDefect(
2897 "Only parameters are valid after content type, but "
2898 "found {!r}".format(value)))
2899 # The RFC requires that a syntactically invalid content-type be treated
2900 # as text/plain. Perhaps we should postel this, but we should probably
2901 # only do that if we were checking the subtype value against IANA.
2902 del ctype.maintype, ctype.subtype
2903 _find_mime_parameters(ctype, value)
2904 return ctype
2905 ctype.append(ValueTerminal(';', 'parameter-separator'))
2906 ctype.append(parse_mime_parameters(value[1:]))
2907 return ctype
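
# Illustrative sketch only (not used by the parser): a routine Content-Type
# value.  Expected results are a hedged reading of the code above.
def _sketch_parse_content_type_header():
    ctype = parse_content_type_header('text/plain; charset="us-ascii"')
    # ctype.maintype should be 'text' and ctype.subtype 'plain', with the
    # charset parameter carried in the parameter list that follows the ';'.
    return ctype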
2908
2909def parse_content_disposition_header(value):
2910 """ disposition-type *( ";" parameter )
2911
2912 """
2913 disp_header = ContentDisposition()
2914 if not value:
2915 disp_header.defects.append(errors.HeaderMissingRequiredValue(
2916 "Missing content disposition"))
2917 return disp_header
2918 try:
2919 token, value = get_token(value)
2920 except errors.HeaderParseError:
2921        disp_header.defects.append(errors.InvalidHeaderDefect(
2922            "Expected content disposition but found {!r}".format(value)))
2923 _find_mime_parameters(disp_header, value)
2924 return disp_header
2925 disp_header.append(token)
2926 disp_header.content_disposition = token.value.strip().lower()
2927 if not value:
2928 return disp_header
2929 if value[0] != ';':
2930 disp_header.defects.append(errors.InvalidHeaderDefect(
2931 "Only parameters are valid after content disposition, but "
2932 "found {!r}".format(value)))
2933 _find_mime_parameters(disp_header, value)
2934 return disp_header
2935 disp_header.append(ValueTerminal(';', 'parameter-separator'))
2936 disp_header.append(parse_mime_parameters(value[1:]))
2937 return disp_header
2938
2939def parse_content_transfer_encoding_header(value):
2940 """ mechanism
2941
2942 """
2943 # We should probably validate the values, since the list is fixed.
2944 cte_header = ContentTransferEncoding()
2945 if not value:
2946 cte_header.defects.append(errors.HeaderMissingRequiredValue(
2947 "Missing content transfer encoding"))
2948 return cte_header
2949 try:
2950 token, value = get_token(value)
2951 except errors.HeaderParseError:
2952        cte_header.defects.append(errors.InvalidHeaderDefect(
2953            "Expected content transfer encoding but found {!r}".format(value)))
2954    else:
2955 cte_header.append(token)
2956 cte_header.cte = token.value.strip().lower()
2957 if not value:
2958 return cte_header
2959 while value:
2960 cte_header.defects.append(errors.InvalidHeaderDefect(
2961 "Extra text after content transfer encoding"))
2962 if value[0] in PHRASE_ENDS:
2963 cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
2964 value = value[1:]
2965 else:
2966 token, value = get_phrase(value)
2967 cte_header.append(token)
2968 return cte_header