Blame - Lib/email/_header_value_parser.py - platform/external/python/cpython3

blob: 26cfa52723fe2db10419f618a259800ffa68316f [file] [log] [blame]

R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1	"""Header value parser implementing various email-related RFC parsing rules.
				2
				3	The parsing methods defined in this module implement various email related
				4	parsing rules. Principal among them is RFC 5322, which is the followon
				5	to RFC 2822 and primarily a clarification of the former. It also implements
				6	RFC 2047 encoded word decoding.
				7
				8	RFC 5322 goes to considerable trouble to maintain backward compatibility with
				9	RFC 822 in the parse phase, while cleaning up the structure on the generation
				10	phase. This parser supports correct RFC 5322 generation by tagging white space
				11	as folding white space only when folding is allowed in the non-obsolete rule
				12	sets. Actually, the parser is even more generous when accepting input than RFC
				13	5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
				14	Where possible deviations from the standard are annotated on the 'defects'
				15	attribute of tokens that deviate.
				16
				17	The general structure of the parser follows RFC 5322, and uses its terminology
				18	where there is a direct correspondence. Where the implementation requires a
				19	somewhat different structure than that used by the formal grammar, new terms
				20	that mimic the closest existing terms are used. Thus, it really helps to have
				21	a copy of RFC 5322 handy when studying this code.
				22
				23	Input to the parser is a string that has already been unfolded according to
				24	RFC 5322 rules. According to the RFC this unfolding is the very first step, and
				25	this parser leaves the unfolding step to a higher level message parser, which
				26	will have already detected the line breaks that need unfolding while
				27	determining the beginning and end of each header.
				28
				29	The output of the parser is a TokenList object, which is a list subclass. A
				30	TokenList is a recursive data structure. The terminal nodes of the structure
				31	are Terminal objects, which are subclasses of str. These do not correspond
				32	directly to terminal objects in the formal grammar, but are instead more
				33	practical higher level combinations of true terminals.
				34
				35	All TokenList and Terminal objects have a 'value' attribute, which produces the
				36	semantically meaningful value of that part of the parse subtree. The value of
				37	all whitespace tokens (no matter how many sub-tokens they may contain) is a
				38	single space, as per the RFC rules. This includes 'CFWS', which is herein
				39	included in the general class of whitespace tokens. There is one exception to
				40	the rule that whitespace tokens are collapsed into single spaces in values: in
				41	the value of a 'bare-quoted-string' (a quoted-string with no leading or
				42	trailing whitespace), any whitespace that appeared between the quotation marks
				43	is preserved in the returned value. Note that in all Terminal strings quoted
				44	pairs are turned into their unquoted values.
				45
				46	All TokenList and Terminal objects also have a string value, which attempts to
				47	be a "canonical" representation of the RFC-compliant form of the substring that
				48	produced the parsed subtree, including minimal use of quoted pair quoting.
				49	Whitespace runs are not collapsed.
				50
				51	Comment tokens also have a 'content' attribute providing the string found
				52	between the parens (including any nested comments) with whitespace preserved.
				53
				54	All TokenList and Terminal objects have a 'defects' attribute which is a
				55	possibly empty list of all the defects found while creating the token. Defects
				56	may appear on any token in the tree, and a composite list of all defects in the
				57	subtree is available through the 'all_defects' attribute of any node. (For
				58	Terminal nodes x.defects == x.all_defects.)
				59
				60	Each object in a parse tree is called a 'token', and each has a 'token_type'
				61	attribute that gives the name from the RFC 5322 grammar that it represents.
				62	Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
				63	may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
				64	It is returned in place of lists of (ctext/quoted-pair) and
				65	(qtext/quoted-pair).
				66
				67	XXX: provide complete list of token types.
				68	"""
				69
				70	import re
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	71	import urllib # For urllib.parse.unquote
				72	from collections import namedtuple, OrderedDict
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	73	from email import _encoded_words as _ew
				74	from email import errors
				75	from email import utils
				76
				77	#
				78	# Useful constants and functions
				79	#
				80
				81	WSP = set(' \t')
				82	CFWS_LEADER = WSP \| set('(')
				83	SPECIALS = set(r'()<>@,:;.\"[]')
				84	ATOM_ENDS = SPECIALS \| WSP
				85	DOT_ATOM_ENDS = ATOM_ENDS - set('.')
				86	# '.', '"', and '(' do not end phrases in order to support obs-phrase
				87	PHRASE_ENDS = SPECIALS - set('."(')
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	88	TSPECIALS = (SPECIALS \| set('/?=')) - set('.')
				89	TOKEN_ENDS = TSPECIALS \| WSP
				90	ASPECIALS = TSPECIALS \| set("*'%")
				91	ATTRIBUTE_ENDS = ASPECIALS \| WSP
				92	EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	93
				94	def quote_string(value):
				95	return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
				96
				97	#
				98	# Accumulator for header folding
				99	#
				100
class _Folded:
    """Accumulator that collects the pieces of a header while it is folded.

    Completed lines are stored in ``done`` (each followed by the policy's
    line separator); the pieces of the line currently being built are kept
    in ``current``.  ``lastlen`` tracks the length of the text in
    ``current`` and ``stickyspace`` holds leading white space that has been
    peeled off a token and must be emitted together with the next text.
    """

    def __init__(self, maxlen, policy):
        # maxlen: maximum number of characters allowed on one folded line.
        # policy: object providing at least ``linesep``.
        self.maxlen = maxlen
        self.policy = policy
        self.lastlen = 0
        self.stickyspace = None
        # True until the first piece has been placed via the sticky-space path.
        self.firstline = True
        self.done = []
        self.current = []

    def newline(self):
        """Move the current line to ``done``, terminate it, and start afresh."""
        self.done.extend(self.current)
        self.done.append(self.policy.linesep)
        self.current.clear()
        self.lastlen = 0

    def finalize(self):
        """Flush a pending, non-empty current line."""
        if self.current:
            self.newline()

    def __str__(self):
        # Only completed lines are included; call finalize() first.
        return ''.join(self.done)

    def append(self, stoken):
        """Unconditionally add *stoken* to the current line.

        Note that ``lastlen`` is not updated here.
        """
        self.current.append(stoken)

    def append_if_fits(self, token, stoken=None):
        """Try to place *token* (rendered as *stoken*) without exceeding maxlen.

        If *stoken* is None, ``str(token)`` is used.  Returns True when the
        text was placed (possibly after starting a new line, or by
        delegating to ``token._fold``), and False when there is no sticky
        space pending and the text neither fits on the current line nor is
        shorter than ``maxlen``.
        """
        if stoken is None:
            stoken = str(token)
        l = len(stoken)
        if self.stickyspace is not None:
            stickyspace_len = len(self.stickyspace)
            # Easy case: white space plus text fit on the current line.
            if self.lastlen + stickyspace_len + l <= self.maxlen:
                self.current.append(self.stickyspace)
                self.lastlen += stickyspace_len
                self.current.append(stoken)
                self.lastlen += l
                self.stickyspace = None
                self.firstline = False
                return True
            # The token has internal fold points: merge its leading white
            # space into the sticky space and let the token fold itself.
            if token.has_fws:
                ws = token.pop_leading_fws()
                if ws is not None:
                    self.stickyspace += str(ws)
                    stickyspace_len += len(ws)
                token._fold(self)
                return True
            # The text fits on a line of its own with at least one
            # character of white space in front of it.
            if stickyspace_len and l + 1 <= self.maxlen:
                margin = self.maxlen - l
                if 0 < margin < stickyspace_len:
                    # Not all of the white space fits in front of the text:
                    # leave the first part on the current line.
                    trim = stickyspace_len - margin
                    self.current.append(self.stickyspace[:trim])
                    self.stickyspace = self.stickyspace[trim:]
                    # NOTE(review): the remaining sticky space has length
                    # ``margin``, not ``trim`` -- confirm this is intended.
                    stickyspace_len = trim
                self.newline()
                self.current.append(self.stickyspace)
                self.current.append(stoken)
                self.lastlen = l + stickyspace_len
                self.stickyspace = None
                self.firstline = False
                return True
            # Fallback: emit the white space and text even though they are
            # too long, on a fresh line unless nothing has been placed yet.
            if not self.firstline:
                self.newline()
            self.current.append(self.stickyspace)
            self.current.append(stoken)
            # NOTE(review): lastlen is not updated on this path.
            self.stickyspace = None
            self.firstline = False
            return True
        # No sticky space pending.
        if self.lastlen + l <= self.maxlen:
            self.current.append(stoken)
            self.lastlen += l
            return True
        if l < self.maxlen:
            self.newline()
            self.current.append(stoken)
            self.lastlen = l
            return True
        return False
				180
				181	#
				182	# TokenList and its subclasses
				183	#
				184
				185	class TokenList(list):
				186
				187	token_type = None
				188
				189	def __init__(self, args, *kw):
				190	super().__init__(args, *kw)
				191	self.defects = []
				192
				193	def __str__(self):
				194	return ''.join(str(x) for x in self)
				195
				196	def __repr__(self):
				197	return '{}({})'.format(self.__class__.__name__,
				198	super().__repr__())
				199
				200	@property
				201	def value(self):
				202	return ''.join(x.value for x in self if x.value)
				203
				204	@property
				205	def all_defects(self):
				206	return sum((x.all_defects for x in self), self.defects)
				207
				208	#
				209	# Folding API
				210	#
				211	# parts():
				212	#
				213	# return a list of objects that constitute the "higher level syntactic
				214	# objects" specified by the RFC as the best places to fold a header line.
				215	# The returned objects must include leading folding white space, even if
				216	# this means mutating the underlying parse tree of the object. Each object
				217	# is only responsible for returning its parts, and should not drill down
				218	# to any lower level except as required to meet the leading folding white
				219	# space constraint.
				220	#
				221	# _fold(folded):
				222	#
				223	# folded: the result accumulator. This is an instance of _Folded.
				224	# (XXX: I haven't finished factoring this out yet, the folding code
				225	# pretty much uses this as a state object.) When the folded.current
				226	# contains as much text as will fit, the _fold method should call
				227	# folded.newline.
				228	# folded.lastlen: the current length of the test stored in folded.current.
				229	# folded.maxlen: The maximum number of characters that may appear on a
				230	# folded line. Differs from the policy setting in that "no limit" is
				231	# represented by +inf, which means it can be used in the trivially
				232	# logical fashion in comparisons.
				233	#
				234	# Currently no subclasses implement parts, and I think this will remain
				235	# true. A subclass only needs to implement _fold when the generic version
				236	# isn't sufficient. _fold will need to be implemented primarily when it is
				237	# possible for encoded words to appear in the specialized token-list, since
				238	# there is no generic algorithm that can know where exactly the encoded
				239	# words are allowed. A _fold implementation is responsible for filling
				240	# lines in the same general way that the top level _fold does. It may, and
				241	# should, call the _fold method of sub-objects in a similar fashion to that
				242	# of the top level _fold.
				243	#
				244	# XXX: I'm hoping it will be possible to factor the existing code further
				245	# to reduce redundancy and make the logic clearer.
				246
				247	@property
				248	def parts(self):
				249	klass = self.__class__
				250	this = []
				251	for token in self:
				252	if token.startswith_fws():
				253	if this:
				254	yield this[0] if len(this)==1 else klass(this)
				255	this.clear()
				256	end_ws = token.pop_trailing_ws()
				257	this.append(token)
				258	if end_ws:
				259	yield klass(this)
				260	this = [end_ws]
				261	if this:
				262	yield this[0] if len(this)==1 else klass(this)
				263
				264	def startswith_fws(self):
				265	return self[0].startswith_fws()
				266
				267	def pop_leading_fws(self):
				268	if self[0].token_type == 'fws':
				269	return self.pop(0)
				270	return self[0].pop_leading_fws()
				271
				272	def pop_trailing_ws(self):
				273	if self[-1].token_type == 'cfws':
				274	return self.pop(-1)
				275	return self[-1].pop_trailing_ws()
				276
				277	@property
				278	def has_fws(self):
				279	for part in self:
				280	if part.has_fws:
				281	return True
				282	return False
				283
				284	def has_leading_comment(self):
				285	return self[0].has_leading_comment()
				286
				287	@property
				288	def comments(self):
				289	comments = []
				290	for token in self:
				291	comments.extend(token.comments)
				292	return comments
				293
				294	def fold(self, *, policy):
				295	# max_line_length 0/None means no limit, ie: infinitely long.
				296	maxlen = policy.max_line_length or float("+inf")
				297	folded = _Folded(maxlen, policy)
				298	self._fold(folded)
				299	folded.finalize()
				300	return str(folded)
				301
				302	def as_encoded_word(self, charset):
				303	# This works only for things returned by 'parts', which include
				304	# the leading fws, if any, that should be used.
				305	res = []
				306	ws = self.pop_leading_fws()
				307	if ws:
				308	res.append(ws)
				309	trailer = self.pop(-1) if self[-1].token_type=='fws' else ''
				310	res.append(_ew.encode(str(self), charset))
				311	res.append(trailer)
				312	return ''.join(res)
				313
				314	def cte_encode(self, charset, policy):
				315	res = []
				316	for part in self:
				317	res.append(part.cte_encode(charset, policy))
				318	return ''.join(res)
				319
				320	def _fold(self, folded):
				321	for part in self.parts:
				322	tstr = str(part)
				323	tlen = len(tstr)
				324	try:
				325	str(part).encode('us-ascii')
				326	except UnicodeEncodeError:
				327	if any(isinstance(x, errors.UndecodableBytesDefect)
				328	for x in part.all_defects):
				329	charset = 'unknown-8bit'
				330	else:
				331	# XXX: this should be a policy setting
				332	charset = 'utf-8'
				333	tstr = part.cte_encode(charset, folded.policy)
				334	tlen = len(tstr)
				335	if folded.append_if_fits(part, tstr):
				336	continue
				337	# Peel off the leading whitespace if any and make it sticky, to
				338	# avoid infinite recursion.
				339	ws = part.pop_leading_fws()
				340	if ws is not None:
				341	# Peel off the leading whitespace and make it sticky, to
				342	# avoid infinite recursion.
				343	folded.stickyspace = str(part.pop(0))
				344	if folded.append_if_fits(part):
				345	continue
				346	if part.has_fws:
				347	part._fold(folded)
				348	continue
				349	# There are no fold points in this one; it is too long for a single
				350	# line and can't be split...we just have to put it on its own line.
				351	folded.append(tstr)
				352	folded.newline()
				353
				354	def pprint(self, indent=''):
				355	print('\n'.join(self._pp(indent='')))
				356
				357	def ppstr(self, indent=''):
				358	return '\n'.join(self._pp(indent=''))
				359
				360	def _pp(self, indent=''):
				361	yield '{}{}/{}('.format(
				362	indent,
				363	self.__class__.__name__,
				364	self.token_type)
				365	for token in self:
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	366	if not hasattr(token, '_pp'):
				367	yield (indent + ' !! invalid element in token '
				368	'list: {!r}'.format(token))
				369	else:
				370	for line in token._pp(indent+' '):
				371	yield line
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	372	if self.defects:
				373	extra = ' Defects: {}'.format(self.defects)
				374	else:
				375	extra = ''
				376	yield '{}){}'.format(indent, extra)
				377
				378
				379	class WhiteSpaceTokenList(TokenList):
				380
				381	@property
				382	def value(self):
				383	return ' '
				384
				385	@property
				386	def comments(self):
				387	return [x.content for x in self if x.token_type=='comment']
				388
				389
				390	class UnstructuredTokenList(TokenList):
				391
				392	token_type = 'unstructured'
				393
				394	def _fold(self, folded):
				395	if any(x.token_type=='encoded-word' for x in self):
				396	return self._fold_encoded(folded)
				397	# Here we can have either a pure ASCII string that may or may not
				398	# have surrogateescape encoded bytes, or a unicode string.
				399	last_ew = None
				400	for part in self.parts:
				401	tstr = str(part)
				402	is_ew = False
				403	try:
				404	str(part).encode('us-ascii')
				405	except UnicodeEncodeError:
				406	if any(isinstance(x, errors.UndecodableBytesDefect)
				407	for x in part.all_defects):
				408	charset = 'unknown-8bit'
				409	else:
				410	charset = 'utf-8'
				411	if last_ew is not None:
				412	# We've already done an EW, combine this one with it
				413	# if there's room.
				414	chunk = get_unstructured(
				415	''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
				416	oldlastlen = sum(len(x) for x in folded.current[:last_ew])
				417	schunk = str(chunk)
				418	lchunk = len(schunk)
				419	if oldlastlen + lchunk <= folded.maxlen:
				420	del folded.current[last_ew:]
				421	folded.append(schunk)
				422	folded.lastlen = oldlastlen + lchunk
				423	continue
				424	tstr = part.as_encoded_word(charset)
				425	is_ew = True
				426	if folded.append_if_fits(part, tstr):
				427	if is_ew:
				428	last_ew = len(folded.current) - 1
				429	continue
				430	if is_ew or last_ew:
				431	# It's too big to fit on the line, but since we've
				432	# got encoded words we can use encoded word folding.
				433	part._fold_as_ew(folded)
				434	continue
				435	# Peel off the leading whitespace if any and make it sticky, to
				436	# avoid infinite recursion.
				437	ws = part.pop_leading_fws()
				438	if ws is not None:
				439	folded.stickyspace = str(ws)
				440	if folded.append_if_fits(part):
				441	continue
				442	if part.has_fws:
				443	part.fold(folded)
				444	continue
				445	# It can't be split...we just have to put it on its own line.
				446	folded.append(tstr)
				447	folded.newline()
				448	last_ew = None
				449
				450	def cte_encode(self, charset, policy):
				451	res = []
				452	last_ew = None
				453	for part in self:
				454	spart = str(part)
				455	try:
				456	spart.encode('us-ascii')
				457	res.append(spart)
				458	except UnicodeEncodeError:
				459	if last_ew is None:
				460	res.append(part.cte_encode(charset, policy))
				461	last_ew = len(res)
				462	else:
				463	tl = get_unstructured(''.join(res[last_ew:] + [spart]))
				464	res.append(tl.as_encoded_word())
				465	return ''.join(res)
				466
				467
class Phrase(TokenList):
    """Token list for an RFC 'phrase', with encoded-word aware folding."""

    token_type = 'phrase'

    def _fold(self, folded):
        """Fold the parts into *folded*, encoding non-ASCII parts as needed.

        ``last_ew`` is the index in ``folded.current`` of the most recent
        encoded word that may still be merged with a following one; it is
        reset when a comment or quoted string intervenes.
        """
        # As with Unstructured, we can have pure ASCII with or without
        # surrogateescape encoded bytes, or we could have unicode.  But this
        # case is more complicated, since we have to deal with the various
        # sub-token types and how they can be composed in the face of
        # unicode-that-needs-CTE-encoding, and the fact that if a token has a
        # comment that becomes a barrier across which we can't compose encoded
        # words.
        last_ew = None
        for part in self.parts:
            tstr = str(part)
            tlen = len(tstr)
            has_ew = False
            try:
                str(part).encode('us-ascii')
            except UnicodeEncodeError:
                if any(isinstance(x, errors.UndecodableBytesDefect)
                        for x in part.all_defects):
                    charset = 'unknown-8bit'
                else:
                    charset = 'utf-8'
                if last_ew is not None and not part.has_leading_comment():
                    # We've already done an EW, let's see if we can combine
                    # this one with it.  The last_ew logic ensures that all we
                    # have at this point is atoms, no comments or quoted
                    # strings.  So we can treat the text between the last
                    # encoded word and the content of this token as
                    # unstructured text, and things will work correctly.  But
                    # we have to strip off any trailing comment on this token
                    # first, and if it is a quoted string we have to pull out
                    # the content (we're encoding it, so it no longer needs to
                    # be quoted).
                    # NOTE(review): 'remainder' is assigned but never used
                    # within this method.
                    if part[-1].token_type == 'cfws' and part.comments:
                        remainder = part.pop(-1)
                    else:
                        remainder = ''
                    for i, token in enumerate(part):
                        if token.token_type == 'bare-quoted-string':
                            part[i] = UnstructuredTokenList(token[:])
                    chunk = get_unstructured(
                        ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset)
                    schunk = str(chunk)
                    lchunk = len(schunk)
                    # NOTE(review): last_ew is a list index, not a character
                    # count; UnstructuredTokenList._fold sums the lengths of
                    # folded.current[:last_ew] instead -- confirm.
                    if last_ew + lchunk <= folded.maxlen:
                        del folded.current[last_ew:]
                        folded.append(schunk)
                        folded.lastlen = sum(len(x) for x in folded.current)
                        continue
                tstr = part.as_encoded_word(charset)
                tlen = len(tstr)
                has_ew = True
            if folded.append_if_fits(part, tstr):
                if has_ew and not part.comments:
                    last_ew = len(folded.current) - 1
                elif part.comments or part.token_type == 'quoted-string':
                    # If a comment is involved we can't combine EWs.  And if a
                    # quoted string is involved, it's not worth the effort to
                    # try to combine them.
                    last_ew = None
                continue
            # Did not fit as a whole: let the part fold itself.
            part._fold(folded)

    def cte_encode(self, charset, policy):
        """Return the text with non-ASCII parts encoded using *charset*.

        Consecutive encodable parts are merged into a single encoded word
        unless a comment (or an ASCII quoted string) separates them.
        """
        res = []
        last_ew = None
        is_ew = False
        for part in self:
            spart = str(part)
            try:
                spart.encode('us-ascii')
                res.append(spart)
            except UnicodeEncodeError:
                is_ew = True
                if last_ew is None:
                    if not part.comments:
                        last_ew = len(res)
                    res.append(part.cte_encode(charset, policy))
                elif not part.has_leading_comment():
                    # NOTE(review): 'remainder' is assigned but never used.
                    if part[-1].token_type == 'cfws' and part.comments:
                        remainder = part.pop(-1)
                    else:
                        remainder = ''
                    for i, token in enumerate(part):
                        if token.token_type == 'bare-quoted-string':
                            part[i] = UnstructuredTokenList(token[:])
                    tl = get_unstructured(''.join(res[last_ew:] + [spart]))
                    # Replace everything from the previous encoded word on
                    # with the single merged encoded word.
                    res[last_ew:] = [tl.as_encoded_word(charset)]
            # NOTE(review): is_ew is never reset to False inside the loop.
            if part.comments or (not is_ew and part.token_type == 'quoted-string'):
                last_ew = None
        return ''.join(res)
				562
class Word(TokenList):
    """Token list tagged with token_type 'word'."""

    token_type = 'word'
				566
				567
				568	class CFWSList(WhiteSpaceTokenList):
				569
				570	token_type = 'cfws'
				571
				572	def has_leading_comment(self):
				573	return bool(self.comments)
				574
				575
class Atom(TokenList):
    """Token list tagged with token_type 'atom'."""

    token_type = 'atom'
				579
				580
class Token(TokenList):
    """Token list tagged with token_type 'token'."""

    token_type = 'token'
				584
				585
class EncodedWord(TokenList):
    """Token list for an RFC 2047 encoded word."""

    token_type = 'encoded-word'
    # Class-level defaults; instances may override them.
    cte = None
    charset = None
    lang = None

    @property
    def encoded(self):
        """Return the encoded form of this word.

        The stored ``cte`` value is returned when it is set; otherwise the
        text is encoded with ``self.charset``.
        """
        if self.cte is not None:
            return self.cte
        # The result must be returned; without 'return' the property
        # silently evaluated to None.
        return _ew.encode(str(self), self.charset)
				598
				599
				600
				601	class QuotedString(TokenList):
				602
				603	token_type = 'quoted-string'
				604
				605	@property
				606	def content(self):
				607	for x in self:
				608	if x.token_type == 'bare-quoted-string':
				609	return x.value
				610
				611	@property
				612	def quoted_value(self):
				613	res = []
				614	for x in self:
				615	if x.token_type == 'bare-quoted-string':
				616	res.append(str(x))
				617	else:
				618	res.append(x.value)
				619	return ''.join(res)
				620
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	621	@property
				622	def stripped_value(self):
				623	for token in self:
				624	if token.token_type == 'bare-quoted-string':
				625	return token.value
				626
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	627
				628	class BareQuotedString(QuotedString):
				629
				630	token_type = 'bare-quoted-string'
				631
				632	def __str__(self):
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	633	return quote_string(''.join(str(x) for x in self))
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	634
				635	@property
				636	def value(self):
				637	return ''.join(str(x) for x in self)
				638
				639
				640	class Comment(WhiteSpaceTokenList):
				641
				642	token_type = 'comment'
				643
				644	def __str__(self):
				645	return ''.join(sum([
				646	["("],
				647	[self.quote(x) for x in self],
				648	[")"],
				649	], []))
				650
				651	def quote(self, value):
				652	if value.token_type == 'comment':
				653	return str(value)
				654	return str(value).replace('\\', '\\\\').replace(
				655	'(', '\(').replace(
				656	')', '\)')
				657
				658	@property
				659	def content(self):
				660	return ''.join(str(x) for x in self)
				661
				662	@property
				663	def comments(self):
				664	return [self.content]
				665
				666	class AddressList(TokenList):
				667
				668	token_type = 'address-list'
				669
				670	@property
				671	def addresses(self):
				672	return [x for x in self if x.token_type=='address']
				673
				674	@property
				675	def mailboxes(self):
				676	return sum((x.mailboxes
				677	for x in self if x.token_type=='address'), [])
				678
				679	@property
				680	def all_mailboxes(self):
				681	return sum((x.all_mailboxes
				682	for x in self if x.token_type=='address'), [])
				683
				684
				685	class Address(TokenList):
				686
				687	token_type = 'address'
				688
				689	@property
				690	def display_name(self):
				691	if self[0].token_type == 'group':
				692	return self[0].display_name
				693
				694	@property
				695	def mailboxes(self):
				696	if self[0].token_type == 'mailbox':
				697	return [self[0]]
				698	elif self[0].token_type == 'invalid-mailbox':
				699	return []
				700	return self[0].mailboxes
				701
				702	@property
				703	def all_mailboxes(self):
				704	if self[0].token_type == 'mailbox':
				705	return [self[0]]
				706	elif self[0].token_type == 'invalid-mailbox':
				707	return [self[0]]
				708	return self[0].all_mailboxes
				709
				710	class MailboxList(TokenList):
				711
				712	token_type = 'mailbox-list'
				713
				714	@property
				715	def mailboxes(self):
				716	return [x for x in self if x.token_type=='mailbox']
				717
				718	@property
				719	def all_mailboxes(self):
				720	return [x for x in self
				721	if x.token_type in ('mailbox', 'invalid-mailbox')]
				722
				723
				724	class GroupList(TokenList):
				725
				726	token_type = 'group-list'
				727
				728	@property
				729	def mailboxes(self):
				730	if not self or self[0].token_type != 'mailbox-list':
				731	return []
				732	return self[0].mailboxes
				733
				734	@property
				735	def all_mailboxes(self):
				736	if not self or self[0].token_type != 'mailbox-list':
				737	return []
				738	return self[0].all_mailboxes
				739
				740
				741	class Group(TokenList):
				742
				743	token_type = "group"
				744
				745	@property
				746	def mailboxes(self):
				747	if self[2].token_type != 'group-list':
				748	return []
				749	return self[2].mailboxes
				750
				751	@property
				752	def all_mailboxes(self):
				753	if self[2].token_type != 'group-list':
				754	return []
				755	return self[2].all_mailboxes
				756
				757	@property
				758	def display_name(self):
				759	return self[0].display_name
				760
				761
				762	class NameAddr(TokenList):
				763
				764	token_type = 'name-addr'
				765
				766	@property
				767	def display_name(self):
				768	if len(self) == 1:
				769	return None
				770	return self[0].display_name
				771
				772	@property
				773	def local_part(self):
				774	return self[-1].local_part
				775
				776	@property
				777	def domain(self):
				778	return self[-1].domain
				779
				780	@property
				781	def route(self):
				782	return self[-1].route
				783
				784	@property
				785	def addr_spec(self):
				786	return self[-1].addr_spec
				787
				788
				789	class AngleAddr(TokenList):
				790
				791	token_type = 'angle-addr'
				792
				793	@property
				794	def local_part(self):
				795	for x in self:
				796	if x.token_type == 'addr-spec':
				797	return x.local_part
				798
				799	@property
				800	def domain(self):
				801	for x in self:
				802	if x.token_type == 'addr-spec':
				803	return x.domain
				804
				805	@property
				806	def route(self):
				807	for x in self:
				808	if x.token_type == 'obs-route':
				809	return x.domains
				810
				811	@property
				812	def addr_spec(self):
				813	for x in self:
				814	if x.token_type == 'addr-spec':
				815	return x.addr_spec
R David Murray	032eed3	2012-05-26 14:31:12 -0400	[diff] [blame]	816	else:
				817	return '<>'
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	818
				819
				820	class ObsRoute(TokenList):
				821
				822	token_type = 'obs-route'
				823
				824	@property
				825	def domains(self):
				826	return [x.domain for x in self if x.token_type == 'domain']
				827
				828
				829	class Mailbox(TokenList):
				830
				831	token_type = 'mailbox'
				832
				833	@property
				834	def display_name(self):
				835	if self[0].token_type == 'name-addr':
				836	return self[0].display_name
				837
				838	@property
				839	def local_part(self):
				840	return self[0].local_part
				841
				842	@property
				843	def domain(self):
				844	return self[0].domain
				845
				846	@property
				847	def route(self):
				848	if self[0].token_type == 'name-addr':
				849	return self[0].route
				850
				851	@property
				852	def addr_spec(self):
				853	return self[0].addr_spec
				854
				855
				856	class InvalidMailbox(TokenList):
				857
				858	token_type = 'invalid-mailbox'
				859
				860	@property
				861	def display_name(self):
				862	return None
				863
				864	local_part = domain = route = addr_spec = display_name
				865
				866
				867	class Domain(TokenList):
				868
				869	token_type = 'domain'
				870
				871	@property
				872	def domain(self):
				873	return ''.join(super().value.split())
				874
				875
class DotAtom(TokenList):
    """Token list tagged with token_type 'dot-atom'."""

    token_type = 'dot-atom'
				879
				880
class DotAtomText(TokenList):
    """Token list tagged with token_type 'dot-atom-text'."""

    token_type = 'dot-atom-text'
				884
				885
				886	class AddrSpec(TokenList):
				887
				888	token_type = 'addr-spec'
				889
				890	@property
				891	def local_part(self):
				892	return self[0].local_part
				893
				894	@property
				895	def domain(self):
				896	if len(self) < 3:
				897	return None
				898	return self[-1].domain
				899
				900	@property
				901	def value(self):
				902	if len(self) < 3:
				903	return self[0].value
				904	return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
				905
				906	@property
				907	def addr_spec(self):
				908	nameset = set(self.local_part)
				909	if len(nameset) > len(nameset-DOT_ATOM_ENDS):
				910	lp = quote_string(self.local_part)
				911	else:
				912	lp = self.local_part
				913	if self.domain is not None:
				914	return lp + '@' + self.domain
				915	return lp
				916
				917
class ObsLocalPart(TokenList):
    """Token list tagged with token_type 'obs-local-part'."""

    token_type = 'obs-local-part'
				921
				922
				923	class DisplayName(Phrase):
				924
				925	token_type = 'display-name'
				926
				927	@property
				928	def display_name(self):
				929	res = TokenList(self)
				930	if res[0].token_type == 'cfws':
				931	res.pop(0)
				932	else:
				933	if res[0][0].token_type == 'cfws':
				934	res[0] = TokenList(res[0][1:])
				935	if res[-1].token_type == 'cfws':
				936	res.pop()
				937	else:
				938	if res[-1][-1].token_type == 'cfws':
				939	res[-1] = TokenList(res[-1][:-1])
				940	return res.value
				941
				942	@property
				943	def value(self):
				944	quote = False
				945	if self.defects:
				946	quote = True
				947	else:
				948	for x in self:
				949	if x.token_type == 'quoted-string':
				950	quote = True
				951	if quote:
				952	pre = post = ''
				953	if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
				954	pre = ' '
				955	if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
				956	post = ' '
				957	return pre+quote_string(self.display_name)+post
				958	else:
				959	return super().value
				960
				961
				962	class LocalPart(TokenList):
				963
				964	token_type = 'local-part'
				965
				966	@property
				967	def value(self):
				968	if self[0].token_type == "quoted-string":
				969	return self[0].quoted_value
				970	else:
				971	return self[0].value
				972
				973	@property
				974	def local_part(self):
				975	# Strip whitespace from front, back, and around dots.
				976	res = [DOT]
				977	last = DOT
				978	last_is_tl = False
				979	for tok in self[0] + [DOT]:
				980	if tok.token_type == 'cfws':
				981	continue
				982	if (last_is_tl and tok.token_type == 'dot' and
				983	last[-1].token_type == 'cfws'):
				984	res[-1] = TokenList(last[:-1])
				985	is_tl = isinstance(tok, TokenList)
				986	if (is_tl and last.token_type == 'dot' and
				987	tok[0].token_type == 'cfws'):
				988	res.append(TokenList(tok[1:]))
				989	else:
				990	res.append(tok)
				991	last = res[-1]
				992	last_is_tl = is_tl
				993	res = TokenList(res[1:-1])
				994	return res.value
				995
				996
				997	class DomainLiteral(TokenList):
				998
				999	token_type = 'domain-literal'
				1000
				1001	@property
				1002	def domain(self):
				1003	return ''.join(super().value.split())
				1004
				1005	@property
				1006	def ip(self):
				1007	for x in self:
				1008	if x.token_type == 'ptext':
				1009	return x.value
				1010
				1011
class MIMEVersion(TokenList):
    """Token list tagged as a 'mime-version'."""

    token_type = 'mime-version'
    # Version components; both default to None at class level.
    major = None
    minor = None
				1017
				1018
				1019	class Parameter(TokenList):
				1020
				1021	token_type = 'parameter'
				1022	sectioned = False
				1023	extended = False
				1024	charset = 'us-ascii'
				1025
				1026	@property
				1027	def section_number(self):
				1028	# Because the first token, the attribute (name) eats CFWS, the second
				1029	# token is always the section if there is one.
				1030	return self[1].number if self.sectioned else 0
				1031
				1032	@property
				1033	def param_value(self):
				1034	# This is part of the "handle quoted extended parameters" hack.
				1035	for token in self:
				1036	if token.token_type == 'value':
				1037	return token.stripped_value
				1038	if token.token_type == 'quoted-string':
				1039	for token in token:
				1040	if token.token_type == 'bare-quoted-string':
				1041	for token in token:
				1042	if token.token_type == 'value':
				1043	return token.stripped_value
				1044	return ''
				1045
				1046
				1047	class InvalidParameter(Parameter):
				1048
				1049	token_type = 'invalid-parameter'
				1050
				1051
				1052	class Attribute(TokenList):
				1053
				1054	token_type = 'attribute'
				1055
				1056	@property
				1057	def stripped_value(self):
				1058	for token in self:
				1059	if token.token_type.endswith('attrtext'):
				1060	return token.value
				1061
				1062	class Section(TokenList):
				1063
				1064	token_type = 'section'
				1065	number = None
				1066
				1067
				1068	class Value(TokenList):
				1069
				1070	token_type = 'value'
				1071
				1072	@property
				1073	def stripped_value(self):
				1074	token = self[0]
				1075	if token.token_type == 'cfws':
				1076	token = self[1]
				1077	if token.token_type.endswith(
				1078	('quoted-string', 'attribute', 'extended-attribute')):
				1079	return token.stripped_value
				1080	return self.value
				1081
				1082
				1083	class MimeParameters(TokenList):
				1084
				1085	token_type = 'mime-parameters'
				1086
				1087	@property
				1088	def params(self):
				1089	# The RFC specifically states that the ordering of parameters is not
				1090	# guaranteed and may be reordered by the transport layer. So we have
				1091	# to assume the RFC 2231 pieces can come in any order. However, we
				1092	# output them in the order that we first see a given name, which gives
				1093	# us a stable __str__.
				1094	params = OrderedDict()
				1095	for token in self:
				1096	if not token.token_type.endswith('parameter'):
				1097	continue
				1098	if token[0].token_type != 'attribute':
				1099	continue
				1100	name = token[0].value.strip()
				1101	if name not in params:
				1102	params[name] = []
				1103	params[name].append((token.section_number, token))
				1104	for name, parts in params.items():
				1105	parts = sorted(parts)
				1106	# XXX: there might be more recovery we could do here if, for
				1107	# example, this is really a case of a duplicate attribute name.
				1108	value_parts = []
				1109	charset = parts[0][1].charset
				1110	for i, (section_number, param) in enumerate(parts):
				1111	if section_number != i:
				1112	param.defects.append(errors.InvalidHeaderDefect(
				1113	"inconsistent multipart parameter numbering"))
				1114	value = param.param_value
				1115	if param.extended:
				1116	try:
				1117	value = urllib.parse.unquote_to_bytes(value)
				1118	except UnicodeEncodeError:
				1119	# source had surrogate escaped bytes. What we do now
				1120	# is a bit of an open question. I'm not sure this is
				1121	# the best choice, but it is what the old algorithm did
				1122	value = urllib.parse.unquote(value, encoding='latin-1')
				1123	else:
				1124	try:
				1125	value = value.decode(charset, 'surrogateescape')
				1126	except LookupError:
				1127	# XXX: there should really be a custom defect for
				1128	# unknown character set to make it easy to find,
				1129	# because otherwise unknown charset is a silent
				1130	# failure.
				1131	value = value.decode('us-ascii', 'surrogateescape')
				1132	if utils._has_surrogates(value):
				1133	param.defects.append(errors.UndecodableBytesDefect())
				1134	value_parts.append(value)
				1135	value = ''.join(value_parts)
				1136	yield name, value
				1137
				1138	def __str__(self):
				1139	params = []
				1140	for name, value in self.params:
				1141	if value:
				1142	params.append('{}={}'.format(name, quote_string(value)))
				1143	else:
				1144	params.append(name)
				1145	params = '; '.join(params)
				1146	return ' ' + params if params else ''
				1147
				1148
				1149	class ParameterizedHeaderValue(TokenList):
				1150
				1151	@property
				1152	def params(self):
				1153	for token in reversed(self):
				1154	if token.token_type == 'mime-parameters':
				1155	return token.params
				1156	return {}
				1157
				1158	@property
				1159	def parts(self):
				1160	if self and self[-1].token_type == 'mime-parameters':
				1161	# We don't want to start a new line if all of the params don't fit
				1162	# after the value, so unwrap the parameter list.
				1163	return TokenList(self[:-1] + self[-1])
				1164	return TokenList(self).parts
				1165
				1166
				1167	class ContentType(ParameterizedHeaderValue):
				1168
				1169	token_type = 'content-type'
				1170	maintype = 'text'
				1171	subtype = 'plain'
				1172
				1173
				1174	class ContentDisposition(ParameterizedHeaderValue):
				1175
				1176	token_type = 'content-disposition'
				1177	content_disposition = None
				1178
				1179
				1180	class ContentTransferEncoding(TokenList):
				1181
				1182	token_type = 'content-transfer-encoding'
				1183	cte = '7bit'
				1184
				1185
class HeaderLabel(TokenList):
    """Token list tagged as a 'header-label'."""

    token_type = 'header-label'
				1189
				1190
				1191	class Header(TokenList):
				1192
				1193	token_type = 'header'
				1194
				1195	def _fold(self, folded):
				1196	folded.append(str(self.pop(0)))
				1197	folded.lastlen = len(folded.current[0])
				1198	# The first line of the header is different from all others: we don't
				1199	# want to start a new object on a new line if it has any fold points in
				1200	# it that would allow part of it to be on the first header line.
				1201	# Further, if the first fold point would fit on the new line, we want
				1202	# to do that, but if it doesn't we want to put it on the first line.
				1203	# Folded supports this via the stickyspace attribute. If this
				1204	# attribute is not None, it does the special handling.
				1205	folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
				1206	rest = self.pop(0)
				1207	if self:
				1208	raise ValueError("Malformed Header token list")
				1209	rest._fold(folded)
				1210
				1211
				1212	#
				1213	# Terminal classes and instances
				1214	#
				1215
				1216	class Terminal(str):
				1217
				1218	def __new__(cls, value, token_type):
				1219	self = super().__new__(cls, value)
				1220	self.token_type = token_type
				1221	self.defects = []
				1222	return self
				1223
				1224	def __repr__(self):
				1225	return "{}({})".format(self.__class__.__name__, super().__repr__())
				1226
				1227	@property
				1228	def all_defects(self):
				1229	return list(self.defects)
				1230
				1231	def _pp(self, indent=''):
				1232	return ["{}{}/{}({}){}".format(
				1233	indent,
				1234	self.__class__.__name__,
				1235	self.token_type,
				1236	super().__repr__(),
				1237	'' if not self.defects else ' {}'.format(self.defects),
				1238	)]
				1239
				1240	def cte_encode(self, charset, policy):
				1241	value = str(self)
				1242	try:
				1243	value.encode('us-ascii')
				1244	return value
				1245	except UnicodeEncodeError:
				1246	return _ew.encode(value, charset)
				1247
				1248	def pop_trailing_ws(self):
				1249	# This terminates the recursion.
				1250	return None
				1251
				1252	def pop_leading_fws(self):
				1253	# This terminates the recursion.
				1254	return None
				1255
				1256	@property
				1257	def comments(self):
				1258	return []
				1259
				1260	def has_leading_comment(self):
				1261	return False
				1262
				1263	def __getnewargs__(self):
				1264	return(str(self), self.token_type)
				1265
				1266
				1267	class WhiteSpaceTerminal(Terminal):
				1268
				1269	@property
				1270	def value(self):
				1271	return ' '
				1272
				1273	def startswith_fws(self):
				1274	return True
				1275
				1276	has_fws = True
				1277
				1278
				1279	class ValueTerminal(Terminal):
				1280
				1281	@property
				1282	def value(self):
				1283	return self
				1284
				1285	def startswith_fws(self):
				1286	return False
				1287
				1288	has_fws = False
				1289
				1290	def as_encoded_word(self, charset):
				1291	return _ew.encode(str(self), charset)
				1292
				1293
				1294	class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
				1295
				1296	@property
				1297	def value(self):
				1298	return ''
				1299
				1300	@property
				1301	def encoded(self):
				1302	return self[:]
				1303
				1304	def __str__(self):
				1305	return ''
				1306
				1307	has_fws = True
				1308
				1309
				1310	# XXX these need to become classes and used as instances so
				1311	# that a program can't change them in a parse tree and screw
				1312	# up other parse trees. Maybe should have tests for that, too.
				1313	DOT = ValueTerminal('.', 'dot')
				1314	ListSeparator = ValueTerminal(',', 'list-separator')
				1315	RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
				1316
				1317	#
				1318	# Parser
				1319	#
				1320
				1321	"""Parse strings according to RFC822/2047/2822/5322 rules.
				1322
				1323	This is a stateless parser. Each get_XXX function accepts a string and
				1324	returns either a Terminal or a TokenList representing the RFC object named
				1325	by the method and a string containing the remaining unparsed characters
				1326	from the input. Thus a parser method consumes the next syntactic construct
				1327	of a given type and returns a token representing the construct plus the
				1328	unparsed remainder of the input string.
				1329
				1330	For example, if the first element of a structured header is a 'phrase',
				1331	then:
				1332
				1333	phrase, value = get_phrase(value)
				1334
				1335	returns the complete phrase from the start of the string value, plus any
				1336	characters left in the string after the phrase is removed.
				1337
				1338	"""
				1339
				1340	_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
				1341	_non_atom_end_matcher = re.compile(r"[^{}]+".format(
				1342	''.join(ATOM_ENDS).replace('\\','\\\\').replace(']','\]'))).match
				1343	_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	1344	_non_token_end_matcher = re.compile(r"[^{}]+".format(
				1345	''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']','\]'))).match
				1346	_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
				1347	''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']','\]'))).match
				1348	_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
				1349	''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
				1350	'\\','\\\\').replace(']','\]'))).match
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1351
				1352	def _validate_xtext(xtext):
				1353	"""If input token contains ASCII non-printables, register a defect."""
				1354
				1355	non_printables = _non_printable_finder(xtext)
				1356	if non_printables:
				1357	xtext.defects.append(errors.NonPrintableDefect(non_printables))
				1358	if utils._has_surrogates(xtext):
				1359	xtext.defects.append(errors.UndecodableBytesDefect(
				1360	"Non-ASCII characters found in header token"))
				1361
				1362	def _get_ptext_to_endchars(value, endchars):
				1363	"""Scan printables/quoted-pairs until endchars and return unquoted ptext.
				1364
				1365	This function turns a run of qcontent, ccontent-without-comments, or
				1366	dtext-with-quoted-printables into a single string by unquoting any
				1367	quoted printables. It returns the string, the remaining value, and
				1368	a flag that is True iff there were any quoted printables decoded.
				1369
				1370	"""
				1371	fragment, *remainder = _wsp_splitter(value, 1)
				1372	vchars = []
				1373	escape = False
				1374	had_qp = False
				1375	for pos in range(len(fragment)):
				1376	if fragment[pos] == '\\':
				1377	if escape:
				1378	escape = False
				1379	had_qp = True
				1380	else:
				1381	escape = True
				1382	continue
				1383	if escape:
				1384	escape = False
				1385	elif fragment[pos] in endchars:
				1386	break
				1387	vchars.append(fragment[pos])
				1388	else:
				1389	pos = pos + 1
				1390	return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
				1391
				1392	def _decode_ew_run(value):
				1393	""" Decode a run of RFC2047 encoded words.
				1394
				1395	_decode_ew_run(value) -> (text, value, defects)
				1396
				1397	Scans the supplied value for a run of tokens that look like they are RFC
				1398	2047 encoded words, decodes those words into text according to RFC 2047
				1399	rules (whitespace between encoded words is discarded), and returns the text
				1400	and the remaining value (including any leading whitespace on the remaining
				1401	value), as well as a list of any defects encountered while decoding. The
				1402	input value may not have any leading whitespace.
				1403
				1404	"""
				1405	res = []
				1406	defects = []
				1407	last_ws = ''
				1408	while value:
				1409	try:
				1410	tok, ws, value = _wsp_splitter(value, 1)
				1411	except ValueError:
				1412	tok, ws, value = value, '', ''
				1413	if not (tok.startswith('=?') and tok.endswith('?=')):
				1414	return ''.join(res), last_ws + tok + ws + value, defects
				1415	text, charset, lang, new_defects = _ew.decode(tok)
				1416	res.append(text)
				1417	defects.extend(new_defects)
				1418	last_ws = ws
				1419	return ''.join(res), last_ws, defects
				1420
				1421	def get_fws(value):
				1422	"""FWS = 1*WSP
				1423
				1424	This isn't the RFC definition. We're using fws to represent tokens where
				1425	folding can be done, but when we are parsing the unfolding has already
				1426	been done so we don't need to watch out for CRLF.
				1427
				1428	"""
				1429	newvalue = value.lstrip()
				1430	fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
				1431	return fws, newvalue
				1432
				1433	def get_encoded_word(value):
				1434	""" encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
				1435
				1436	"""
				1437	ew = EncodedWord()
				1438	if not value.startswith('=?'):
				1439	raise errors.HeaderParseError(
				1440	"expected encoded word but found {}".format(value))
				1441	tok, *remainder = value[2:].split('?=', 1)
				1442	if tok == value[2:]:
				1443	raise errors.HeaderParseError(
				1444	"expected encoded word but found {}".format(value))
				1445	remstr = ''.join(remainder)
				1446	if remstr[:2].isdigit():
				1447	rest, *remainder = remstr.split('?=', 1)
				1448	tok = tok + '?=' + rest
				1449	if len(tok.split()) > 1:
				1450	ew.defects.append(errors.InvalidHeaderDefect(
				1451	"whitespace inside encoded word"))
				1452	ew.cte = value
				1453	value = ''.join(remainder)
				1454	try:
				1455	text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
				1456	except ValueError:
				1457	raise errors.HeaderParseError(
				1458	"encoded word format invalid: '{}'".format(ew.cte))
				1459	ew.charset = charset
				1460	ew.lang = lang
				1461	ew.defects.extend(defects)
				1462	while text:
				1463	if text[0] in WSP:
				1464	token, text = get_fws(text)
				1465	ew.append(token)
				1466	continue
				1467	chars, *remainder = _wsp_splitter(text, 1)
				1468	vtext = ValueTerminal(chars, 'vtext')
				1469	_validate_xtext(vtext)
				1470	ew.append(vtext)
				1471	text = ''.join(remainder)
				1472	return ew, value
				1473
				1474	def get_unstructured(value):
				1475	"""unstructured = (([FWS] vchar) WSP) / obs-unstruct
				1476	obs-unstruct = ((LF CR (obs-utext) LF CR)) / FWS)
				1477	obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
				1478
				1479	obs-NO-WS-CTL is control characters except WSP/CR/LF.
				1480
				1481	So, basically, we have printable runs, plus control characters or nulls in
				1482	the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
				1483	obsolete syntax in its specification, but requires whitespace on either
				1484	side of the encoded words, I can see no reason to need to separate the
				1485	non-printable-non-whitespace from the printable runs if they occur, so we
				1486	parse this into xtext tokens separated by WSP tokens.
				1487
				1488	Because an 'unstructured' value must by definition constitute the entire
				1489	value, this 'get' routine does not return a remaining value, only the
				1490	parsed TokenList.
				1491
				1492	"""
				1493	# XXX: but what about bare CR and LF? They might signal the start or
				1494	# end of an encoded word. YAGNI for now, since out current parsers
				1495	# will never send us strings with bard CR or LF.
				1496
				1497	unstructured = UnstructuredTokenList()
				1498	while value:
				1499	if value[0] in WSP:
				1500	token, value = get_fws(value)
				1501	unstructured.append(token)
				1502	continue
				1503	if value.startswith('=?'):
				1504	try:
				1505	token, value = get_encoded_word(value)
				1506	except errors.HeaderParseError:
				1507	pass
				1508	else:
				1509	have_ws = True
				1510	if len(unstructured) > 0:
				1511	if unstructured[-1].token_type != 'fws':
				1512	unstructured.defects.append(errors.InvalidHeaderDefect(
				1513	"missing whitespace before encoded word"))
				1514	have_ws = False
				1515	if have_ws and len(unstructured) > 1:
				1516	if unstructured[-2].token_type == 'encoded-word':
				1517	unstructured[-1] = EWWhiteSpaceTerminal(
				1518	unstructured[-1], 'fws')
				1519	unstructured.append(token)
				1520	continue
				1521	tok, *remainder = _wsp_splitter(value, 1)
				1522	vtext = ValueTerminal(tok, 'vtext')
				1523	_validate_xtext(vtext)
				1524	unstructured.append(vtext)
				1525	value = ''.join(remainder)
				1526	return unstructured
				1527
				1528	def get_qp_ctext(value):
				1529	"""ctext = <printable ascii except \ ( )>
				1530
				1531	This is not the RFC ctext, since we are handling nested comments in comment
				1532	and unquoting quoted-pairs here. We allow anything except the '()'
				1533	characters, but if we find any ASCII other than the RFC defined printable
				1534	ASCII an NonPrintableDefect is added to the token's defects list. Since
				1535	quoted pairs are converted to their unquoted values, what is returned is
				1536	a 'ptext' token. In this case it is a WhiteSpaceTerminal, so it's value
				1537	is ' '.
				1538
				1539	"""
				1540	ptext, value, _ = _get_ptext_to_endchars(value, '()')
				1541	ptext = WhiteSpaceTerminal(ptext, 'ptext')
				1542	_validate_xtext(ptext)
				1543	return ptext, value
				1544
				1545	def get_qcontent(value):
				1546	"""qcontent = qtext / quoted-pair
				1547
				1548	We allow anything except the DQUOTE character, but if we find any ASCII
				1549	other than the RFC defined printable ASCII an NonPrintableDefect is
				1550	added to the token's defects list. Any quoted pairs are converted to their
				1551	unquoted values, so what is returned is a 'ptext' token. In this case it
				1552	is a ValueTerminal.
				1553
				1554	"""
				1555	ptext, value, _ = _get_ptext_to_endchars(value, '"')
				1556	ptext = ValueTerminal(ptext, 'ptext')
				1557	_validate_xtext(ptext)
				1558	return ptext, value
				1559
				1560	def get_atext(value):
				1561	"""atext = <matches _atext_matcher>
				1562
				1563	We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
				1564	the token's defects list if we find non-atext characters.
				1565	"""
				1566	m = _non_atom_end_matcher(value)
				1567	if not m:
				1568	raise errors.HeaderParseError(
				1569	"expected atext but found '{}'".format(value))
				1570	atext = m.group()
				1571	value = value[len(atext):]
				1572	atext = ValueTerminal(atext, 'atext')
				1573	_validate_xtext(atext)
				1574	return atext, value
				1575
				1576	def get_bare_quoted_string(value):
				1577	"""bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
				1578
				1579	A quoted-string without the leading or trailing white space. Its
				1580	value is the text between the quote marks, with whitespace
				1581	preserved and quoted pairs decoded.
				1582	"""
				1583	if value[0] != '"':
				1584	raise errors.HeaderParseError(
				1585	"expected '\"' but found '{}'".format(value))
				1586	bare_quoted_string = BareQuotedString()
				1587	value = value[1:]
				1588	while value and value[0] != '"':
				1589	if value[0] in WSP:
				1590	token, value = get_fws(value)
				1591	else:
				1592	token, value = get_qcontent(value)
				1593	bare_quoted_string.append(token)
				1594	if not value:
				1595	bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
				1596	"end of header inside quoted string"))
				1597	return bare_quoted_string, value
				1598	return bare_quoted_string, value[1:]
				1599
				1600	def get_comment(value):
				1601	"""comment = "(" *([FWS] ccontent) [FWS] ")"
				1602	ccontent = ctext / quoted-pair / comment
				1603
				1604	We handle nested comments here, and quoted-pair in our qp-ctext routine.
				1605	"""
				1606	if value and value[0] != '(':
				1607	raise errors.HeaderParseError(
				1608	"expected '(' but found '{}'".format(value))
				1609	comment = Comment()
				1610	value = value[1:]
				1611	while value and value[0] != ")":
				1612	if value[0] in WSP:
				1613	token, value = get_fws(value)
				1614	elif value[0] == '(':
				1615	token, value = get_comment(value)
				1616	else:
				1617	token, value = get_qp_ctext(value)
				1618	comment.append(token)
				1619	if not value:
				1620	comment.defects.append(errors.InvalidHeaderDefect(
				1621	"end of header inside comment"))
				1622	return comment, value
				1623	return comment, value[1:]
				1624
				1625	def get_cfws(value):
				1626	"""CFWS = (1*([FWS] comment) [FWS]) / FWS
				1627
				1628	"""
				1629	cfws = CFWSList()
				1630	while value and value[0] in CFWS_LEADER:
				1631	if value[0] in WSP:
				1632	token, value = get_fws(value)
				1633	else:
				1634	token, value = get_comment(value)
				1635	cfws.append(token)
				1636	return cfws, value
				1637
				1638	def get_quoted_string(value):
				1639	"""quoted-string = [CFWS] <bare-quoted-string> [CFWS]
				1640
				1641	'bare-quoted-string' is an intermediate class defined by this
				1642	parser and not by the RFC grammar. It is the quoted string
				1643	without any attached CFWS.
				1644	"""
				1645	quoted_string = QuotedString()
				1646	if value and value[0] in CFWS_LEADER:
				1647	token, value = get_cfws(value)
				1648	quoted_string.append(token)
				1649	token, value = get_bare_quoted_string(value)
				1650	quoted_string.append(token)
				1651	if value and value[0] in CFWS_LEADER:
				1652	token, value = get_cfws(value)
				1653	quoted_string.append(token)
				1654	return quoted_string, value
				1655
				1656	def get_atom(value):
				1657	"""atom = [CFWS] 1*atext [CFWS]
				1658
				1659	"""
				1660	atom = Atom()
				1661	if value and value[0] in CFWS_LEADER:
				1662	token, value = get_cfws(value)
				1663	atom.append(token)
				1664	if value and value[0] in ATOM_ENDS:
				1665	raise errors.HeaderParseError(
				1666	"expected atom but found '{}'".format(value))
				1667	token, value = get_atext(value)
				1668	atom.append(token)
				1669	if value and value[0] in CFWS_LEADER:
				1670	token, value = get_cfws(value)
				1671	atom.append(token)
				1672	return atom, value
				1673
				1674	def get_dot_atom_text(value):
				1675	""" dot-text = 1atext ("." 1*atext)
				1676
				1677	"""
				1678	dot_atom_text = DotAtomText()
				1679	if not value or value[0] in ATOM_ENDS:
				1680	raise errors.HeaderParseError("expected atom at a start of "
				1681	"dot-atom-text but found '{}'".format(value))
				1682	while value and value[0] not in ATOM_ENDS:
				1683	token, value = get_atext(value)
				1684	dot_atom_text.append(token)
				1685	if value and value[0] == '.':
				1686	dot_atom_text.append(DOT)
				1687	value = value[1:]
				1688	if dot_atom_text[-1] is DOT:
				1689	raise errors.HeaderParseError("expected atom at end of dot-atom-text "
				1690	"but found '{}'".format('.'+value))
				1691	return dot_atom_text, value
				1692
				1693	def get_dot_atom(value):
				1694	""" dot-atom = [CFWS] dot-atom-text [CFWS]
				1695
				1696	"""
				1697	dot_atom = DotAtom()
				1698	if value[0] in CFWS_LEADER:
				1699	token, value = get_cfws(value)
				1700	dot_atom.append(token)
				1701	token, value = get_dot_atom_text(value)
				1702	dot_atom.append(token)
				1703	if value and value[0] in CFWS_LEADER:
				1704	token, value = get_cfws(value)
				1705	dot_atom.append(token)
				1706	return dot_atom, value
				1707
				1708	def get_word(value):
				1709	"""word = atom / quoted-string
				1710
				1711	Either atom or quoted-string may start with CFWS. We have to peel off this
				1712	CFWS first to determine which type of word to parse. Afterward we splice
				1713	the leading CFWS, if any, into the parsed sub-token.
				1714
				1715	If neither an atom or a quoted-string is found before the next special, a
				1716	HeaderParseError is raised.
				1717
				1718	The token returned is either an Atom or a QuotedString, as appropriate.
				1719	This means the 'word' level of the formal grammar is not represented in the
				1720	parse tree; this is because having that extra layer when manipulating the
				1721	parse tree is more confusing than it is helpful.
				1722
				1723	"""
				1724	if value[0] in CFWS_LEADER:
				1725	leader, value = get_cfws(value)
				1726	else:
				1727	leader = None
				1728	if value[0]=='"':
				1729	token, value = get_quoted_string(value)
				1730	elif value[0] in SPECIALS:
				1731	raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
				1732	"but found '{}'".format(value))
				1733	else:
				1734	token, value = get_atom(value)
				1735	if leader is not None:
				1736	token[:0] = [leader]
				1737	return token, value
				1738
				1739	def get_phrase(value):
				1740	""" phrase = 1*word / obs-phrase
				1741	obs-phrase = word *(word / "." / CFWS)
				1742
				1743	This means a phrase can be a sequence of words, periods, and CFWS in any
				1744	order as long as it starts with at least one word. If anything other than
				1745	words is detected, an ObsoleteHeaderDefect is added to the token's defect
				1746	list. We also accept a phrase that starts with CFWS followed by a dot;
				1747	this is registered as an InvalidHeaderDefect, since it is not supported by
				1748	even the obsolete grammar.
				1749
				1750	"""
				1751	phrase = Phrase()
				1752	try:
				1753	token, value = get_word(value)
				1754	phrase.append(token)
				1755	except errors.HeaderParseError:
				1756	phrase.defects.append(errors.InvalidHeaderDefect(
				1757	"phrase does not start with word"))
				1758	while value and value[0] not in PHRASE_ENDS:
				1759	if value[0]=='.':
				1760	phrase.append(DOT)
				1761	phrase.defects.append(errors.ObsoleteHeaderDefect(
				1762	"period in 'phrase'"))
				1763	value = value[1:]
				1764	else:
				1765	try:
				1766	token, value = get_word(value)
				1767	except errors.HeaderParseError:
				1768	if value[0] in CFWS_LEADER:
				1769	token, value = get_cfws(value)
				1770	phrase.defects.append(errors.ObsoleteHeaderDefect(
				1771	"comment found without atom"))
				1772	else:
				1773	raise
				1774	phrase.append(token)
				1775	return phrase, value
				1776
				1777	def get_local_part(value):
				1778	""" local-part = dot-atom / quoted-string / obs-local-part
				1779
				1780	"""
				1781	local_part = LocalPart()
				1782	leader = None
				1783	if value[0] in CFWS_LEADER:
				1784	leader, value = get_cfws(value)
				1785	if not value:
				1786	raise errors.HeaderParseError(
				1787	"expected local-part but found '{}'".format(value))
				1788	try:
				1789	token, value = get_dot_atom(value)
				1790	except errors.HeaderParseError:
				1791	try:
				1792	token, value = get_word(value)
				1793	except errors.HeaderParseError:
				1794	if value[0] != '\\' and value[0] in PHRASE_ENDS:
				1795	raise
				1796	token = TokenList()
				1797	if leader is not None:
				1798	token[:0] = [leader]
				1799	local_part.append(token)
				1800	if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
				1801	obs_local_part, value = get_obs_local_part(str(local_part) + value)
				1802	if obs_local_part.token_type == 'invalid-obs-local-part':
				1803	local_part.defects.append(errors.InvalidHeaderDefect(
				1804	"local-part is not dot-atom, quoted-string, or obs-local-part"))
				1805	else:
				1806	local_part.defects.append(errors.ObsoleteHeaderDefect(
				1807	"local-part is not a dot-atom (contains CFWS)"))
				1808	local_part[0] = obs_local_part
				1809	try:
				1810	local_part.value.encode('ascii')
				1811	except UnicodeEncodeError:
				1812	local_part.defects.append(errors.NonASCIILocalPartDefect(
				1813	"local-part contains non-ASCII characters)"))
				1814	return local_part, value
				1815
				1816	def get_obs_local_part(value):
				1817	""" obs-local-part = word *("." word)
				1818	"""
				1819	obs_local_part = ObsLocalPart()
				1820	last_non_ws_was_dot = False
				1821	while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
				1822	if value[0] == '.':
				1823	if last_non_ws_was_dot:
				1824	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1825	"invalid repeated '.'"))
				1826	obs_local_part.append(DOT)
				1827	last_non_ws_was_dot = True
				1828	value = value[1:]
				1829	continue
				1830	elif value[0]=='\\':
				1831	obs_local_part.append(ValueTerminal(value[0],
				1832	'misplaced-special'))
				1833	value = value[1:]
				1834	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1835	"'\\' character outside of quoted-string/ccontent"))
				1836	last_non_ws_was_dot = False
				1837	continue
				1838	if obs_local_part and obs_local_part[-1].token_type != 'dot':
				1839	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1840	"missing '.' between words"))
				1841	try:
				1842	token, value = get_word(value)
				1843	last_non_ws_was_dot = False
				1844	except errors.HeaderParseError:
				1845	if value[0] not in CFWS_LEADER:
				1846	raise
				1847	token, value = get_cfws(value)
				1848	obs_local_part.append(token)
				1849	if (obs_local_part[0].token_type == 'dot' or
				1850	obs_local_part[0].token_type=='cfws' and
				1851	obs_local_part[1].token_type=='dot'):
				1852	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1853	"Invalid leading '.' in local part"))
				1854	if (obs_local_part[-1].token_type == 'dot' or
				1855	obs_local_part[-1].token_type=='cfws' and
				1856	obs_local_part[-2].token_type=='dot'):
				1857	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1858	"Invalid trailing '.' in local part"))
				1859	if obs_local_part.defects:
				1860	obs_local_part.token_type = 'invalid-obs-local-part'
				1861	return obs_local_part, value
				1862
				1863	def get_dtext(value):
				1864	""" dtext = <printable ascii except \ [ ]> / obs-dtext
				1865	obs-dtext = obs-NO-WS-CTL / quoted-pair
				1866
Terry Jan Reedy	0f84764	2013-03-11 18:34:00 -0400	[diff] [blame]	1867	We allow anything except the excluded characters, but if we find any
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1868	ASCII other than the RFC defined printable ASCII an NonPrintableDefect is
				1869	added to the token's defects list. Quoted pairs are converted to their
				1870	unquoted values, so what is returned is a ptext token, in this case a
				1871	ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
				1872	added to the returned token's defect list.
				1873
				1874	"""
				1875	ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
				1876	ptext = ValueTerminal(ptext, 'ptext')
				1877	if had_qp:
				1878	ptext.defects.append(errors.ObsoleteHeaderDefect(
				1879	"quoted printable found in domain-literal"))
				1880	_validate_xtext(ptext)
				1881	return ptext, value
				1882
				1883	def _check_for_early_dl_end(value, domain_literal):
				1884	if value:
				1885	return False
				1886	domain_literal.append(errors.InvalidHeaderDefect(
				1887	"end of input inside domain-literal"))
				1888	domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
				1889	return True
				1890
				1891	def get_domain_literal(value):
				1892	""" domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
				1893
				1894	"""
				1895	domain_literal = DomainLiteral()
				1896	if value[0] in CFWS_LEADER:
				1897	token, value = get_cfws(value)
				1898	domain_literal.append(token)
				1899	if not value:
				1900	raise errors.HeaderParseError("expected domain-literal")
				1901	if value[0] != '[':
				1902	raise errors.HeaderParseError("expected '[' at start of domain-literal "
				1903	"but found '{}'".format(value))
				1904	value = value[1:]
				1905	if _check_for_early_dl_end(value, domain_literal):
				1906	return domain_literal, value
				1907	domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
				1908	if value[0] in WSP:
				1909	token, value = get_fws(value)
				1910	domain_literal.append(token)
				1911	token, value = get_dtext(value)
				1912	domain_literal.append(token)
				1913	if _check_for_early_dl_end(value, domain_literal):
				1914	return domain_literal, value
				1915	if value[0] in WSP:
				1916	token, value = get_fws(value)
				1917	domain_literal.append(token)
				1918	if _check_for_early_dl_end(value, domain_literal):
				1919	return domain_literal, value
				1920	if value[0] != ']':
				1921	raise errors.HeaderParseError("expected ']' at end of domain-literal "
				1922	"but found '{}'".format(value))
				1923	domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
				1924	value = value[1:]
				1925	if value and value[0] in CFWS_LEADER:
				1926	token, value = get_cfws(value)
				1927	domain_literal.append(token)
				1928	return domain_literal, value
				1929
				1930	def get_domain(value):
				1931	""" domain = dot-atom / domain-literal / obs-domain
				1932	obs-domain = atom *("." atom))
				1933
				1934	"""
				1935	domain = Domain()
				1936	leader = None
				1937	if value[0] in CFWS_LEADER:
				1938	leader, value = get_cfws(value)
				1939	if not value:
				1940	raise errors.HeaderParseError(
				1941	"expected domain but found '{}'".format(value))
				1942	if value[0] == '[':
				1943	token, value = get_domain_literal(value)
				1944	if leader is not None:
				1945	token[:0] = [leader]
				1946	domain.append(token)
				1947	return domain, value
				1948	try:
				1949	token, value = get_dot_atom(value)
				1950	except errors.HeaderParseError:
				1951	token, value = get_atom(value)
				1952	if leader is not None:
				1953	token[:0] = [leader]
				1954	domain.append(token)
				1955	if value and value[0] == '.':
				1956	domain.defects.append(errors.ObsoleteHeaderDefect(
				1957	"domain is not a dot-atom (contains CFWS)"))
				1958	if domain[0].token_type == 'dot-atom':
				1959	domain[:] = domain[0]
				1960	while value and value[0] == '.':
				1961	domain.append(DOT)
				1962	token, value = get_atom(value[1:])
				1963	domain.append(token)
				1964	return domain, value
				1965
				1966	def get_addr_spec(value):
				1967	""" addr-spec = local-part "@" domain
				1968
				1969	"""
				1970	addr_spec = AddrSpec()
				1971	token, value = get_local_part(value)
				1972	addr_spec.append(token)
				1973	if not value or value[0] != '@':
				1974	addr_spec.defects.append(errors.InvalidHeaderDefect(
				1975	"add-spec local part with no domain"))
				1976	return addr_spec, value
				1977	addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
				1978	token, value = get_domain(value[1:])
				1979	addr_spec.append(token)
				1980	return addr_spec, value
				1981
				1982	def get_obs_route(value):
				1983	""" obs-route = obs-domain-list ":"
				1984	obs-domain-list = (CFWS / ",") "@" domain ("," [CFWS] ["@" domain])
				1985
				1986	Returns an obs-route token with the appropriate sub-tokens (that is,
				1987	there is no obs-domain-list in the parse tree).
				1988	"""
				1989	obs_route = ObsRoute()
				1990	while value and (value[0]==',' or value[0] in CFWS_LEADER):
				1991	if value[0] in CFWS_LEADER:
				1992	token, value = get_cfws(value)
				1993	obs_route.append(token)
				1994	elif value[0] == ',':
				1995	obs_route.append(ListSeparator)
				1996	value = value[1:]
				1997	if not value or value[0] != '@':
				1998	raise errors.HeaderParseError(
				1999	"expected obs-route domain but found '{}'".format(value))
				2000	obs_route.append(RouteComponentMarker)
				2001	token, value = get_domain(value[1:])
				2002	obs_route.append(token)
				2003	while value and value[0]==',':
				2004	obs_route.append(ListSeparator)
				2005	value = value[1:]
				2006	if not value:
				2007	break
				2008	if value[0] in CFWS_LEADER:
				2009	token, value = get_cfws(value)
				2010	obs_route.append(token)
				2011	if value[0] == '@':
				2012	obs_route.append(RouteComponentMarker)
				2013	token, value = get_domain(value[1:])
				2014	obs_route.append(token)
				2015	if not value:
				2016	raise errors.HeaderParseError("end of header while parsing obs-route")
				2017	if value[0] != ':':
				2018	raise errors.HeaderParseError( "expected ':' marking end of "
				2019	"obs-route but found '{}'".format(value))
				2020	obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
				2021	return obs_route, value[1:]
				2022
				2023	def get_angle_addr(value):
				2024	""" angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
				2025	obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
				2026
				2027	"""
				2028	angle_addr = AngleAddr()
				2029	if value[0] in CFWS_LEADER:
				2030	token, value = get_cfws(value)
				2031	angle_addr.append(token)
				2032	if not value or value[0] != '<':
				2033	raise errors.HeaderParseError(
				2034	"expected angle-addr but found '{}'".format(value))
				2035	angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
				2036	value = value[1:]
R David Murray	032eed3	2012-05-26 14:31:12 -0400	[diff] [blame]	2037	# Although it is not legal per RFC5322, SMTP uses '<>' in certain
				2038	# circumstances.
				2039	if value[0] == '>':
				2040	angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
				2041	angle_addr.defects.append(errors.InvalidHeaderDefect(
				2042	"null addr-spec in angle-addr"))
				2043	value = value[1:]
				2044	return angle_addr, value
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	2045	try:
				2046	token, value = get_addr_spec(value)
				2047	except errors.HeaderParseError:
				2048	try:
				2049	token, value = get_obs_route(value)
				2050	angle_addr.defects.append(errors.ObsoleteHeaderDefect(
				2051	"obsolete route specification in angle-addr"))
				2052	except errors.HeaderParseError:
				2053	raise errors.HeaderParseError(
R David Murray	032eed3	2012-05-26 14:31:12 -0400	[diff] [blame]	2054	"expected addr-spec or obs-route but found '{}'".format(value))
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	2055	angle_addr.append(token)
				2056	token, value = get_addr_spec(value)
				2057	angle_addr.append(token)
				2058	if value and value[0] == '>':
				2059	value = value[1:]
				2060	else:
				2061	angle_addr.defects.append(errors.InvalidHeaderDefect(
				2062	"missing trailing '>' on angle-addr"))
				2063	angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
				2064	if value and value[0] in CFWS_LEADER:
				2065	token, value = get_cfws(value)
				2066	angle_addr.append(token)
				2067	return angle_addr, value
				2068
				2069	def get_display_name(value):
				2070	""" display-name = phrase
				2071
				2072	Because this is simply a name-rule, we don't return a display-name
				2073	token containing a phrase, but rather a display-name token with
				2074	the content of the phrase.
				2075
				2076	"""
				2077	display_name = DisplayName()
				2078	token, value = get_phrase(value)
				2079	display_name.extend(token[:])
				2080	display_name.defects = token.defects[:]
				2081	return display_name, value
				2082
				2083
				2084	def get_name_addr(value):
				2085	""" name-addr = [display-name] angle-addr
				2086
				2087	"""
				2088	name_addr = NameAddr()
				2089	# Both the optional display name and the angle-addr can start with cfws.
				2090	leader = None
				2091	if value[0] in CFWS_LEADER:
				2092	leader, value = get_cfws(value)
				2093	if not value:
				2094	raise errors.HeaderParseError(
				2095	"expected name-addr but found '{}'".format(leader))
				2096	if value[0] != '<':
				2097	if value[0] in PHRASE_ENDS:
				2098	raise errors.HeaderParseError(
				2099	"expected name-addr but found '{}'".format(value))
				2100	token, value = get_display_name(value)
				2101	if not value:
				2102	raise errors.HeaderParseError(
				2103	"expected name-addr but found '{}'".format(token))
				2104	if leader is not None:
				2105	token[0][:0] = [leader]
				2106	leader = None
				2107	name_addr.append(token)
				2108	token, value = get_angle_addr(value)
				2109	if leader is not None:
				2110	token[:0] = [leader]
				2111	name_addr.append(token)
				2112	return name_addr, value
				2113
				2114	def get_mailbox(value):
				2115	""" mailbox = name-addr / addr-spec
				2116
				2117	"""
				2118	# The only way to figure out if we are dealing with a name-addr or an
				2119	# addr-spec is to try parsing each one.
				2120	mailbox = Mailbox()
				2121	try:
				2122	token, value = get_name_addr(value)
				2123	except errors.HeaderParseError:
				2124	try:
				2125	token, value = get_addr_spec(value)
				2126	except errors.HeaderParseError:
				2127	raise errors.HeaderParseError(
				2128	"expected mailbox but found '{}'".format(value))
				2129	if any(isinstance(x, errors.InvalidHeaderDefect)
				2130	for x in token.all_defects):
				2131	mailbox.token_type = 'invalid-mailbox'
				2132	mailbox.append(token)
				2133	return mailbox, value
				2134
				2135	def get_invalid_mailbox(value, endchars):
				2136	""" Read everything up to one of the chars in endchars.
				2137
				2138	This is outside the formal grammar. The InvalidMailbox TokenList that is
				2139	returned acts like a Mailbox, but the data attributes are None.
				2140
				2141	"""
				2142	invalid_mailbox = InvalidMailbox()
				2143	while value and value[0] not in endchars:
				2144	if value[0] in PHRASE_ENDS:
				2145	invalid_mailbox.append(ValueTerminal(value[0],
				2146	'misplaced-special'))
				2147	value = value[1:]
				2148	else:
				2149	token, value = get_phrase(value)
				2150	invalid_mailbox.append(token)
				2151	return invalid_mailbox, value
				2152
def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = ([CFWS] ",") mailbox ("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve error
    handling.  We recognize the end of the mailbox list only at the end of the
    value or at a ';' (the group terminator).  This is so that we can turn
    invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    Returns a (MailboxList, remaining value) tuple; the remaining value is
    either empty or starts with ';'.

    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Not a valid mailbox: work out whether this is an empty list
            # element or unparseable text that should be kept as an
            # invalid mailbox.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # Element consisting of CFWS only.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                # Completely empty element; the ',' itself is consumed at
                # the bottom of the loop.
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value
				2208
				2209
				2210	def get_group_list(value):
				2211	""" group-list = mailbox-list / CFWS / obs-group-list
				2212	obs-group-list = 1*([CFWS] ",") [CFWS]
				2213
				2214	"""
				2215	group_list = GroupList()
				2216	if not value:
				2217	group_list.defects.append(errors.InvalidHeaderDefect(
				2218	"end of header before group-list"))
				2219	return group_list, value
				2220	leader = None
				2221	if value and value[0] in CFWS_LEADER:
				2222	leader, value = get_cfws(value)
				2223	if not value:
				2224	# This should never happen in email parsing, since CFWS-only is a
				2225	# legal alternative to group-list in a group, which is the only
				2226	# place group-list appears.
				2227	group_list.defects.append(errors.InvalidHeaderDefect(
				2228	"end of header in group-list"))
				2229	group_list.append(leader)
				2230	return group_list, value
				2231	if value[0] == ';':
				2232	group_list.append(leader)
				2233	return group_list, value
				2234	token, value = get_mailbox_list(value)
				2235	if len(token.all_mailboxes)==0:
				2236	if leader is not None:
				2237	group_list.append(leader)
				2238	group_list.extend(token)
				2239	group_list.defects.append(errors.ObsoleteHeaderDefect(
				2240	"group-list with empty entries"))
				2241	return group_list, value
				2242	if leader is not None:
				2243	token[:0] = [leader]
				2244	group_list.append(token)
				2245	return group_list, value
				2246
				2247	def get_group(value):
				2248	""" group = display-name ":" [group-list] ";" [CFWS]
				2249
				2250	"""
				2251	group = Group()
				2252	token, value = get_display_name(value)
				2253	if not value or value[0] != ':':
				2254	raise errors.HeaderParseError("expected ':' at end of group "
				2255	"display name but found '{}'".format(value))
				2256	group.append(token)
				2257	group.append(ValueTerminal(':', 'group-display-name-terminator'))
				2258	value = value[1:]
				2259	if value and value[0] == ';':
				2260	group.append(ValueTerminal(';', 'group-terminator'))
				2261	return group, value[1:]
				2262	token, value = get_group_list(value)
				2263	group.append(token)
				2264	if not value:
				2265	group.defects.append(errors.InvalidHeaderDefect(
				2266	"end of header in group"))
				2267	if value[0] != ';':
				2268	raise errors.HeaderParseError(
				2269	"expected ';' at end of group but found {}".format(value))
				2270	group.append(ValueTerminal(';', 'group-terminator'))
				2271	value = value[1:]
				2272	if value and value[0] in CFWS_LEADER:
				2273	token, value = get_cfws(value)
				2274	group.append(token)
				2275	return group, value
				2276
				2277	def get_address(value):
				2278	""" address = mailbox / group
				2279
				2280	Note that counter-intuitively, an address can be either a single address or
				2281	a list of addresses (a group). This is why the returned Address object has
				2282	a 'mailboxes' attribute which treats a single address as a list of length
				2283	one. When you need to differentiate between to two cases, extract the single
				2284	element, which is either a mailbox or a group token.
				2285
				2286	"""
				2287	# The formal grammar isn't very helpful when parsing an address. mailbox
				2288	# and group, especially when allowing for obsolete forms, start off very
				2289	# similarly. It is only when you reach one of @, <, or : that you know
				2290	# what you've got. So, we try each one in turn, starting with the more
				2291	# likely of the two. We could perhaps make this more efficient by looking
				2292	# for a phrase and then branching based on the next character, but that
				2293	# would be a premature optimization.
				2294	address = Address()
				2295	try:
				2296	token, value = get_group(value)
				2297	except errors.HeaderParseError:
				2298	try:
				2299	token, value = get_mailbox(value)
				2300	except errors.HeaderParseError:
				2301	raise errors.HeaderParseError(
				2302	"expected address but found '{}'".format(value))
				2303	address.append(token)
				2304	return address, value
				2305
def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = ([CFWS] ",") address ("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.

    Returns an (AddressList, remaining value) tuple; since parsing runs to
    the end of the input the remaining value is empty.

    """
    address_list = AddressList()
    while value:
        try:
            token, value = get_address(value)
            address_list.append(token)
        except errors.HeaderParseError as err:
            # Not a valid address: work out whether this is an empty list
            # element or unparseable text that should be kept as an
            # invalid mailbox wrapped in an Address.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] == ',':
                    # Element consisting of CFWS only.
                    address_list.append(leader)
                    address_list.defects.append(errors.ObsoleteHeaderDefect(
                        "address-list entry with no content"))
                else:
                    token, value = get_invalid_mailbox(value, ',')
                    if leader is not None:
                        token[:0] = [leader]
                    address_list.append(Address([token]))
                    address_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid address in address-list"))
            elif value[0] == ',':
                # Completely empty element; the ',' itself is consumed at
                # the bottom of the loop.
                address_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in address-list"))
            else:
                token, value = get_invalid_mailbox(value, ',')
                if leader is not None:
                    token[:0] = [leader]
                address_list.append(Address([token]))
                address_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid address in address-list"))
        if value and value[0] != ',':
            # Crap after address; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = address_list[-1][0]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',')
            mailbox.extend(token)
            address_list.defects.append(errors.InvalidHeaderDefect(
                "invalid address in address-list"))
        if value:  # Must be a , at this point.
            address_list.append(ValueTerminal(',', 'list-separator'))
            value = value[1:]
    return address_list, value
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2359
#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in
# the grammar, and parse_XXX methods that parse an entire field value. So
# get_address_list above should really be a parse_ method, as probably should
# be get_unstructured.
#
				2367
				2368	def parse_mime_version(value):
				2369	""" mime-version = [CFWS] 1digit [CFWS] "." [CFWS] 1digit [CFWS]
				2370
				2371	"""
				2372	# The [CFWS] is implicit in the RFC 2045 BNF.
				2373	# XXX: This routine is a bit verbose, should factor out a get_int method.
				2374	mime_version = MIMEVersion()
				2375	if not value:
				2376	mime_version.defects.append(errors.HeaderMissingRequiredValue(
				2377	"Missing MIME version number (eg: 1.0)"))
				2378	return mime_version
				2379	if value[0] in CFWS_LEADER:
				2380	token, value = get_cfws(value)
				2381	mime_version.append(token)
				2382	if not value:
				2383	mime_version.defects.append(errors.HeaderMissingRequiredValue(
				2384	"Expected MIME version number but found only CFWS"))
				2385	digits = ''
				2386	while value and value[0] != '.' and value[0] not in CFWS_LEADER:
				2387	digits += value[0]
				2388	value = value[1:]
				2389	if not digits.isdigit():
				2390	mime_version.defects.append(errors.InvalidHeaderDefect(
				2391	"Expected MIME major version number but found {!r}".format(digits)))
				2392	mime_version.append(ValueTerminal(digits, 'xtext'))
				2393	else:
				2394	mime_version.major = int(digits)
				2395	mime_version.append(ValueTerminal(digits, 'digits'))
				2396	if value and value[0] in CFWS_LEADER:
				2397	token, value = get_cfws(value)
				2398	mime_version.append(token)
				2399	if not value or value[0] != '.':
				2400	if mime_version.major is not None:
				2401	mime_version.defects.append(errors.InvalidHeaderDefect(
				2402	"Incomplete MIME version; found only major number"))
				2403	if value:
				2404	mime_version.append(ValueTerminal(value, 'xtext'))
				2405	return mime_version
				2406	mime_version.append(ValueTerminal('.', 'version-separator'))
				2407	value = value[1:]
				2408	if value and value[0] in CFWS_LEADER:
				2409	token, value = get_cfws(value)
				2410	mime_version.append(token)
				2411	if not value:
				2412	if mime_version.major is not None:
				2413	mime_version.defects.append(errors.InvalidHeaderDefect(
				2414	"Incomplete MIME version; found only major number"))
				2415	return mime_version
				2416	digits = ''
				2417	while value and value[0] not in CFWS_LEADER:
				2418	digits += value[0]
				2419	value = value[1:]
				2420	if not digits.isdigit():
				2421	mime_version.defects.append(errors.InvalidHeaderDefect(
				2422	"Expected MIME minor version number but found {!r}".format(digits)))
				2423	mime_version.append(ValueTerminal(digits, 'xtext'))
				2424	else:
				2425	mime_version.minor = int(digits)
				2426	mime_version.append(ValueTerminal(digits, 'digits'))
				2427	if value and value[0] in CFWS_LEADER:
				2428	token, value = get_cfws(value)
				2429	mime_version.append(token)
				2430	if value:
				2431	mime_version.defects.append(errors.InvalidHeaderDefect(
				2432	"Excess non-CFWS text after MIME version"))
				2433	mime_version.append(ValueTerminal(value, 'xtext'))
				2434	return mime_version
				2435
				2436	def get_invalid_parameter(value):
				2437	""" Read everything up to the next ';'.
				2438
				2439	This is outside the formal grammar. The InvalidParameter TokenList that is
				2440	returned acts like a Parameter, but the data attributes are None.
				2441
				2442	"""
				2443	invalid_parameter = InvalidParameter()
				2444	while value and value[0] != ';':
				2445	if value[0] in PHRASE_ENDS:
				2446	invalid_parameter.append(ValueTerminal(value[0],
				2447	'misplaced-special'))
				2448	value = value[1:]
				2449	else:
				2450	token, value = get_phrase(value)
				2451	invalid_parameter.append(token)
				2452	return invalid_parameter, value
				2453
				2454	def get_ttext(value):
				2455	"""ttext = <matches _ttext_matcher>
				2456
				2457	We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
				2458	defects list if we find non-ttext characters. We also register defects for
				2459	any non-printables even though the RFC doesn't exclude all of them,
				2460	because we follow the spirit of RFC 5322.
				2461
				2462	"""
				2463	m = _non_token_end_matcher(value)
				2464	if not m:
				2465	raise errors.HeaderParseError(
				2466	"expected ttext but found '{}'".format(value))
				2467	ttext = m.group()
				2468	value = value[len(ttext):]
				2469	ttext = ValueTerminal(ttext, 'ttext')
				2470	_validate_xtext(ttext)
				2471	return ttext, value
				2472
				2473	def get_token(value):
				2474	"""token = [CFWS] 1*ttext [CFWS]
				2475
				2476	The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
				2477	tspecials. We also exclude tabs even though the RFC doesn't.
				2478
				2479	The RFC implies the CFWS but is not explicit about it in the BNF.
				2480
				2481	"""
				2482	mtoken = Token()
				2483	if value and value[0] in CFWS_LEADER:
				2484	token, value = get_cfws(value)
				2485	mtoken.append(token)
				2486	if value and value[0] in TOKEN_ENDS:
				2487	raise errors.HeaderParseError(
				2488	"expected token but found '{}'".format(value))
				2489	token, value = get_ttext(value)
				2490	mtoken.append(token)
				2491	if value and value[0] in CFWS_LEADER:
				2492	token, value = get_cfws(value)
				2493	mtoken.append(token)
				2494	return mtoken, value
				2495
				2496	def get_attrtext(value):
				2497	"""attrtext = 1*(any non-ATTRIBUTE_ENDS character)
				2498
				2499	We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
				2500	token's defects list if we find non-attrtext characters. We also register
				2501	defects for any non-printables even though the RFC doesn't exclude all of
				2502	them, because we follow the spirit of RFC 5322.
				2503
				2504	"""
				2505	m = _non_attribute_end_matcher(value)
				2506	if not m:
				2507	raise errors.HeaderParseError(
				2508	"expected attrtext but found {!r}".format(value))
				2509	attrtext = m.group()
				2510	value = value[len(attrtext):]
				2511	attrtext = ValueTerminal(attrtext, 'attrtext')
				2512	_validate_xtext(attrtext)
				2513	return attrtext, value
				2514
				2515	def get_attribute(value):
				2516	""" [CFWS] 1*attrtext [CFWS]
				2517
				2518	This version of the BNF makes the CFWS explicit, and as usual we use a
				2519	value terminal for the actual run of characters. The RFC equivalent of
				2520	attrtext is the token characters, with the subtraction of '*', "'", and '%'.
				2521	We include tab in the excluded set just as we do for token.
				2522
				2523	"""
				2524	attribute = Attribute()
				2525	if value and value[0] in CFWS_LEADER:
				2526	token, value = get_cfws(value)
				2527	attribute.append(token)
				2528	if value and value[0] in ATTRIBUTE_ENDS:
				2529	raise errors.HeaderParseError(
				2530	"expected token but found '{}'".format(value))
				2531	token, value = get_attrtext(value)
				2532	attribute.append(token)
				2533	if value and value[0] in CFWS_LEADER:
				2534	token, value = get_cfws(value)
				2535	attribute.append(token)
				2536	return attribute, value
				2537
				2538	def get_extended_attrtext(value):
				2539	"""attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
				2540
				2541	This is a special parsing routine so that we get a value that
				2542	includes % escapes as a single string (which we decode as a single
				2543	string later).
				2544
				2545	"""
				2546	m = _non_extended_attribute_end_matcher(value)
				2547	if not m:
				2548	raise errors.HeaderParseError(
				2549	"expected extended attrtext but found {!r}".format(value))
				2550	attrtext = m.group()
				2551	value = value[len(attrtext):]
				2552	attrtext = ValueTerminal(attrtext, 'extended-attrtext')
				2553	_validate_xtext(attrtext)
				2554	return attrtext, value
				2555
				2556	def get_extended_attribute(value):
				2557	""" [CFWS] 1*extended_attrtext [CFWS]
				2558
				2559	This is like the non-extended version except we allow % characters, so that
				2560	we can pick up an encoded value as a single string.
				2561
				2562	"""
				2563	# XXX: should we have an ExtendedAttribute TokenList?
				2564	attribute = Attribute()
				2565	if value and value[0] in CFWS_LEADER:
				2566	token, value = get_cfws(value)
				2567	attribute.append(token)
				2568	if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
				2569	raise errors.HeaderParseError(
				2570	"expected token but found '{}'".format(value))
				2571	token, value = get_extended_attrtext(value)
				2572	attribute.append(token)
				2573	if value and value[0] in CFWS_LEADER:
				2574	token, value = get_cfws(value)
				2575	attribute.append(token)
				2576	return attribute, value
				2577
				2578	def get_section(value):
				2579	""" '*' digits
				2580
				2581	The formal BNF is more complicated because leading 0s are not allowed. We
				2582	check for that and add a defect. We also assume no CFWS is allowed between
				2583	the '*' and the digits, though the RFC is not crystal clear on that.
				2584	The caller should already have dealt with leading CFWS.
				2585
				2586	"""
				2587	section = Section()
				2588	if not value or value[0] != '*':
				2589	raise errors.HeaderParseError("Expected section but found {}".format(
				2590	value))
				2591	section.append(ValueTerminal('*', 'section-marker'))
				2592	value = value[1:]
				2593	if not value or not value[0].isdigit():
				2594	raise errors.HeaderParseError("Expected section number but "
				2595	"found {}".format(value))
				2596	digits = ''
				2597	while value and value[0].isdigit():
				2598	digits += value[0]
				2599	value = value[1:]
				2600	if digits[0] == '0' and digits != '0':
				2601	section.defects.append(errors.InvalidHeaderError("section number"
				2602	"has an invalid leading 0"))
				2603	section.number = int(digits)
				2604	section.append(ValueTerminal(digits, 'digits'))
				2605	return section, value
				2606
				2607
				2608	def get_value(value):
				2609	""" quoted-string / attribute
				2610
				2611	"""
				2612	v = Value()
				2613	if not value:
				2614	raise errors.HeaderParseError("Expected value but found end of string")
				2615	leader = None
				2616	if value[0] in CFWS_LEADER:
				2617	leader, value = get_cfws(value)
				2618	if not value:
				2619	raise errors.HeaderParseError("Expected value but found "
				2620	"only {}".format(leader))
				2621	if value[0] == '"':
				2622	token, value = get_quoted_string(value)
				2623	else:
				2624	token, value = get_extended_attribute(value)
				2625	if leader is not None:
				2626	token[:0] = [leader]
				2627	v.append(token)
				2628	return v, value
				2629
				2630	def get_parameter(value):
				2631	""" attribute [section] ["*"] [CFWS] "=" value
				2632
				2633	The CFWS is implied by the RFC but not made explicit in the BNF. This
				2634	simplified form of the BNF from the RFC is made to conform with the RFC BNF
				2635	through some extra checks. We do it this way because it makes both error
				2636	recovery and working with the resulting parse tree easier.
				2637	"""
				2638	# It is possible CFWS would also be implicitly allowed between the section
				2639	# and the 'extended-attribute' marker (the '*') , but we've never seen that
				2640	# in the wild and we will therefore ignore the possibility.
				2641	param = Parameter()
				2642	token, value = get_attribute(value)
				2643	param.append(token)
				2644	if not value or value[0] == ';':
				2645	param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
				2646	"name ({}) but no value".format(token)))
				2647	return param, value
				2648	if value[0] == '*':
				2649	try:
				2650	token, value = get_section(value)
				2651	param.sectioned = True
				2652	param.append(token)
				2653	except errors.HeaderParseError:
				2654	pass
				2655	if not value:
				2656	raise errors.HeaderParseError("Incomplete parameter")
				2657	if value[0] == '*':
				2658	param.append(ValueTerminal('*', 'extended-parameter-marker'))
				2659	value = value[1:]
				2660	param.extended = True
				2661	if value[0] != '=':
				2662	raise errors.HeaderParseError("Parameter not followed by '='")
				2663	param.append(ValueTerminal('=', 'parameter-separator'))
				2664	value = value[1:]
				2665	leader = None
				2666	if value and value[0] in CFWS_LEADER:
				2667	token, value = get_cfws(value)
				2668	param.append(token)
				2669	remainder = None
				2670	appendto = param
				2671	if param.extended and value and value[0] == '"':
				2672	# Now for some serious hackery to handle the common invalid case of
				2673	# double quotes around an extended value. We also accept (with defect)
				2674	# a value marked as encoded that isn't really.
				2675	qstring, remainder = get_quoted_string(value)
				2676	inner_value = qstring.stripped_value
				2677	semi_valid = False
				2678	if param.section_number == 0:
				2679	if inner_value and inner_value[0] == "'":
				2680	semi_valid = True
				2681	else:
				2682	token, rest = get_attrtext(inner_value)
				2683	if rest and rest[0] == "'":
				2684	semi_valid = True
				2685	else:
				2686	try:
				2687	token, rest = get_extended_attrtext(inner_value)
				2688	except:
				2689	pass
				2690	else:
				2691	if not rest:
				2692	semi_valid = True
				2693	if semi_valid:
				2694	param.defects.append(errors.InvalidHeaderDefect(
				2695	"Quoted string value for extended parameter is invalid"))
				2696	param.append(qstring)
				2697	for t in qstring:
				2698	if t.token_type == 'bare-quoted-string':
				2699	t[:] = []
				2700	appendto = t
				2701	break
				2702	value = inner_value
				2703	else:
				2704	remainder = None
				2705	param.defects.append(errors.InvalidHeaderDefect(
				2706	"Parameter marked as extended but appears to have a "
				2707	"quoted string value that is non-encoded"))
				2708	if value and value[0] == "'":
				2709	token = None
				2710	else:
				2711	token, value = get_value(value)
				2712	if not param.extended or param.section_number > 0:
				2713	if not value or value[0] != "'":
				2714	appendto.append(token)
				2715	if remainder is not None:
				2716	assert not value, value
				2717	value = remainder
				2718	return param, value
				2719	param.defects.append(errors.InvalidHeaderDefect(
				2720	"Apparent initial-extended-value but attribute "
				2721	"was not marked as extended or was not initial section"))
				2722	if not value:
				2723	# Assume the charset/lang is missing and the token is the value.
				2724	param.defects.append(errors.InvalidHeaderDefect(
				2725	"Missing required charset/lang delimiters"))
				2726	appendto.append(token)
				2727	if remainder is None:
				2728	return param, value
				2729	else:
				2730	if token is not None:
				2731	for t in token:
				2732	if t.token_type == 'extended-attrtext':
				2733	break
				2734	t.token_type == 'attrtext'
				2735	appendto.append(t)
				2736	param.charset = t.value
				2737	if value[0] != "'":
				2738	raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
				2739	"delimiter, but found {!r}".format(value))
				2740	appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
				2741	value = value[1:]
				2742	if value and value[0] != "'":
				2743	token, value = get_attrtext(value)
				2744	appendto.append(token)
				2745	param.lang = token.value
				2746	if not value or value[0] != "'":
				2747	raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
				2748	"delimiter, but found {}".format(value))
				2749	appendto.append(ValueTerminal("'", 'RFC2231 delimiter'))
				2750	value = value[1:]
				2751	if remainder is not None:
				2752	# Treat the rest of value as bare quoted string content.
				2753	v = Value()
				2754	while value:
				2755	if value[0] in WSP:
				2756	token, value = get_fws(value)
				2757	else:
				2758	token, value = get_qcontent(value)
				2759	v.append(token)
				2760	token = v
				2761	else:
				2762	token, value = get_value(value)
				2763	appendto.append(token)
				2764	if remainder is not None:
				2765	assert not value, value
				2766	value = remainder
				2767	return param, value
				2768
				2769	def parse_mime_parameters(value):
				2770	""" parameter *( ";" parameter )
				2771
				2772	That BNF is meant to indicate this routine should only be called after
				2773	finding and handling the leading ';'. There is no corresponding rule in
				2774	the formal RFC grammar, but it is more convenient for us for the set of
				2775	parameters to be treated as its own TokenList.
				2776
				2777	This is 'parse' routine because it consumes the reminaing value, but it
				2778	would never be called to parse a full header. Instead it is called to
				2779	parse everything after the non-parameter value of a specific MIME header.
				2780
				2781	"""
				2782	mime_parameters = MimeParameters()
				2783	while value:
				2784	try:
				2785	token, value = get_parameter(value)
				2786	mime_parameters.append(token)
				2787	except errors.HeaderParseError as err:
				2788	leader = None
				2789	if value[0] in CFWS_LEADER:
				2790	leader, value = get_cfws(value)
				2791	if not value:
				2792	mime_parameters.append(leader)
				2793	return mime_parameters
				2794	if value[0] == ';':
				2795	if leader is not None:
				2796	mime_parameters.append(leader)
				2797	mime_parameters.defects.append(errors.InvalidHeaderDefect(
				2798	"parameter entry with no content"))
				2799	else:
				2800	token, value = get_invalid_parameter(value)
				2801	if leader:
				2802	token[:0] = [leader]
				2803	mime_parameters.append(token)
				2804	mime_parameters.defects.append(errors.InvalidHeaderDefect(
				2805	"invalid parameter {!r}".format(token)))
				2806	if value and value[0] != ';':
				2807	# Junk after the otherwise valid parameter. Mark it as
				2808	# invalid, but it will have a value.
				2809	param = mime_parameters[-1]
				2810	param.token_type = 'invalid-parameter'
				2811	token, value = get_invalid_parameter(value)
				2812	param.extend(token)
				2813	mime_parameters.defects.append(errors.InvalidHeaderDefect(
				2814	"parameter with invalid trailing text {!r}".format(token)))
				2815	if value:
				2816	# Must be a ';' at this point.
				2817	mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
				2818	value = value[1:]
				2819	return mime_parameters
				2820
				2821	def _find_mime_parameters(tokenlist, value):
				2822	"""Do our best to find the parameters in an invalid MIME header
				2823
				2824	"""
				2825	while value and value[0] != ';':
				2826	if value[0] in PHRASE_ENDS:
				2827	tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
				2828	value = value[1:]
				2829	else:
				2830	token, value = get_phrase(value)
				2831	tokenlist.append(token)
				2832	if not value:
				2833	return
				2834	tokenlist.append(ValueTerminal(';', 'parameter-separator'))
				2835	tokenlist.append(parse_mime_parameters(value[1:]))
				2836
				2837	def parse_content_type_header(value):
				2838	""" maintype "/" subtype *( ";" parameter )
				2839
				2840	The maintype and substype are tokens. Theoretically they could
				2841	be checked against the official IANA list + x-token, but we
				2842	don't do that.
				2843	"""
				2844	ctype = ContentType()
				2845	recover = False
				2846	if not value:
				2847	ctype.defects.append(errors.HeaderMissingRequiredValue(
				2848	"Missing content type specification"))
				2849	return ctype
				2850	try:
				2851	token, value = get_token(value)
				2852	except errors.HeaderParseError:
				2853	ctype.defects.append(errors.InvalidHeaderDefect(
				2854	"Expected content maintype but found {!r}".format(value)))
				2855	_find_mime_parameters(ctype, value)
				2856	return ctype
				2857	ctype.append(token)
				2858	# XXX: If we really want to follow the formal grammer we should make
				2859	# mantype and subtype specialized TokenLists here. Probably not worth it.
				2860	if not value or value[0] != '/':
				2861	ctype.defects.append(errors.InvalidHeaderDefect(
				2862	"Invalid content type"))
				2863	if value:
				2864	_find_mime_parameters(ctype, value)
				2865	return ctype
				2866	ctype.maintype = token.value.strip().lower()
				2867	ctype.append(ValueTerminal('/', 'content-type-separator'))
				2868	value = value[1:]
				2869	try:
				2870	token, value = get_token(value)
				2871	except errors.HeaderParseError:
				2872	ctype.defects.append(errors.InvalidHeaderDefect(
				2873	"Expected content subtype but found {!r}".format(value)))
				2874	_find_mime_parameters(ctype, value)
				2875	return ctype
				2876	ctype.append(token)
				2877	ctype.subtype = token.value.strip().lower()
				2878	if not value:
				2879	return ctype
				2880	if value[0] != ';':
				2881	ctype.defects.append(errors.InvalidHeaderDefect(
				2882	"Only parameters are valid after content type, but "
				2883	"found {!r}".format(value)))
				2884	# The RFC requires that a syntactically invalid content-type be treated
				2885	# as text/plain. Perhaps we should postel this, but we should probably
				2886	# only do that if we were checking the subtype value against IANA.
				2887	del ctype.maintype, ctype.subtype
				2888	_find_mime_parameters(ctype, value)
				2889	return ctype
				2890	ctype.append(ValueTerminal(';', 'parameter-separator'))
				2891	ctype.append(parse_mime_parameters(value[1:]))
				2892	return ctype
				2893
				2894	def parse_content_disposition_header(value):
				2895	""" disposition-type *( ";" parameter )
				2896
				2897	"""
				2898	disp_header = ContentDisposition()
				2899	if not value:
				2900	disp_header.defects.append(errors.HeaderMissingRequiredValue(
				2901	"Missing content disposition"))
				2902	return disp_header
				2903	try:
				2904	token, value = get_token(value)
				2905	except errors.HeaderParseError:
				2906	ctype.defects.append(errors.InvalidHeaderDefect(
				2907	"Expected content disposition but found {!r}".format(value)))
				2908	_find_mime_parameters(disp_header, value)
				2909	return disp_header
				2910	disp_header.append(token)
				2911	disp_header.content_disposition = token.value.strip().lower()
				2912	if not value:
				2913	return disp_header
				2914	if value[0] != ';':
				2915	disp_header.defects.append(errors.InvalidHeaderDefect(
				2916	"Only parameters are valid after content disposition, but "
				2917	"found {!r}".format(value)))
				2918	_find_mime_parameters(disp_header, value)
				2919	return disp_header
				2920	disp_header.append(ValueTerminal(';', 'parameter-separator'))
				2921	disp_header.append(parse_mime_parameters(value[1:]))
				2922	return disp_header
				2923
				2924	def parse_content_transfer_encoding_header(value):
				2925	""" mechanism
				2926
				2927	"""
				2928	# We should probably validate the values, since the list is fixed.
				2929	cte_header = ContentTransferEncoding()
				2930	if not value:
				2931	cte_header.defects.append(errors.HeaderMissingRequiredValue(
				2932	"Missing content transfer encoding"))
				2933	return cte_header
				2934	try:
				2935	token, value = get_token(value)
				2936	except errors.HeaderParseError:
				2937	ctype.defects.append(errors.InvalidHeaderDefect(
				2938	"Expected content trnasfer encoding but found {!r}".format(value)))
				2939	else:
				2940	cte_header.append(token)
				2941	cte_header.cte = token.value.strip().lower()
				2942	if not value:
				2943	return cte_header
				2944	while value:
				2945	cte_header.defects.append(errors.InvalidHeaderDefect(
				2946	"Extra text after content transfer encoding"))
				2947	if value[0] in PHRASE_ENDS:
				2948	cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
				2949	value = value[1:]
				2950	else:
				2951	token, value = get_phrase(value)
				2952	cte_header.append(token)
				2953	return cte_header