Blame - Lib/email/_header_value_parser.py - platform/external/python/cpython3

blob: bb26d5a556dbbea3f22e99a23cdf70c6d57e2d79 [file] [log] [blame]

R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1	"""Header value parser implementing various email-related RFC parsing rules.
				2
				3	The parsing methods defined in this module implement various email related
				4	parsing rules. Principal among them is RFC 5322, which is the followon
				5	to RFC 2822 and primarily a clarification of the former. It also implements
				6	RFC 2047 encoded word decoding.
				7
				8	RFC 5322 goes to considerable trouble to maintain backward compatibility with
				9	RFC 822 in the parse phase, while cleaning up the structure on the generation
				10	phase. This parser supports correct RFC 5322 generation by tagging white space
				11	as folding white space only when folding is allowed in the non-obsolete rule
				12	sets. Actually, the parser is even more generous when accepting input than RFC
				13	5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages.
				14	Where possible deviations from the standard are annotated on the 'defects'
				15	attribute of tokens that deviate.
				16
				17	The general structure of the parser follows RFC 5322, and uses its terminology
				18	where there is a direct correspondence. Where the implementation requires a
				19	somewhat different structure than that used by the formal grammar, new terms
				20	that mimic the closest existing terms are used. Thus, it really helps to have
				21	a copy of RFC 5322 handy when studying this code.
				22
				23	Input to the parser is a string that has already been unfolded according to
				24	RFC 5322 rules. According to the RFC this unfolding is the very first step, and
				25	this parser leaves the unfolding step to a higher level message parser, which
				26	will have already detected the line breaks that need unfolding while
				27	determining the beginning and end of each header.
				28
				29	The output of the parser is a TokenList object, which is a list subclass. A
				30	TokenList is a recursive data structure. The terminal nodes of the structure
				31	are Terminal objects, which are subclasses of str. These do not correspond
				32	directly to terminal objects in the formal grammar, but are instead more
				33	practical higher level combinations of true terminals.
				34
				35	All TokenList and Terminal objects have a 'value' attribute, which produces the
				36	semantically meaningful value of that part of the parse subtree. The value of
				37	all whitespace tokens (no matter how many sub-tokens they may contain) is a
				38	single space, as per the RFC rules. This includes 'CFWS', which is herein
				39	included in the general class of whitespace tokens. There is one exception to
				40	the rule that whitespace tokens are collapsed into single spaces in values: in
				41	the value of a 'bare-quoted-string' (a quoted-string with no leading or
				42	trailing whitespace), any whitespace that appeared between the quotation marks
				43	is preserved in the returned value. Note that in all Terminal strings quoted
				44	pairs are turned into their unquoted values.
				45
				46	All TokenList and Terminal objects also have a string value, which attempts to
				47	be a "canonical" representation of the RFC-compliant form of the substring that
				48	produced the parsed subtree, including minimal use of quoted pair quoting.
				49	Whitespace runs are not collapsed.
				50
				51	Comment tokens also have a 'content' attribute providing the string found
				52	between the parens (including any nested comments) with whitespace preserved.
				53
				54	All TokenList and Terminal objects have a 'defects' attribute which is a
				55	possibly empty list of all the defects found while creating the token. Defects
				56	may appear on any token in the tree, and a composite list of all defects in the
				57	subtree is available through the 'all_defects' attribute of any node. (For
				58	Terminal nodes x.defects == x.all_defects.)
				59
				60	Each object in a parse tree is called a 'token', and each has a 'token_type'
				61	attribute that gives the name from the RFC 5322 grammar that it represents.
				62	Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that
				63	may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters.
				64	It is returned in place of lists of (ctext/quoted-pair) and
				65	(qtext/quoted-pair).
				66
				67	XXX: provide complete list of token types.
				68	"""
				69
				70	import re
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	71	import urllib # For urllib.parse.unquote
R David Murray	65171b2	2013-07-11 15:52:57 -0400	[diff] [blame]	72	from string import hexdigits
R David Murray	7d0325d	2015-03-29 21:53:05 -0400	[diff] [blame]	73	from operator import itemgetter
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	74	from email import _encoded_words as _ew
				75	from email import errors
				76	from email import utils
				77
				78	#
				79	# Useful constants and functions
				80	#
				81
				82	WSP = set(' \t')
				83	CFWS_LEADER = WSP \| set('(')
				84	SPECIALS = set(r'()<>@,:;.\"[]')
				85	ATOM_ENDS = SPECIALS \| WSP
				86	DOT_ATOM_ENDS = ATOM_ENDS - set('.')
				87	# '.', '"', and '(' do not end phrases in order to support obs-phrase
				88	PHRASE_ENDS = SPECIALS - set('."(')
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	89	TSPECIALS = (SPECIALS \| set('/?=')) - set('.')
				90	TOKEN_ENDS = TSPECIALS \| WSP
				91	ASPECIALS = TSPECIALS \| set("*'%")
				92	ATTRIBUTE_ENDS = ASPECIALS \| WSP
				93	EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%')
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	94
				95	def quote_string(value):
				96	return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"'
				97
				98	#
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	99	# TokenList and its subclasses
				100	#
				101
				102	class TokenList(list):
				103
				104	token_type = None
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	105	syntactic_break = True
				106	ew_combine_allowed = True
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	107
				108	def __init__(self, args, *kw):
				109	super().__init__(args, *kw)
				110	self.defects = []
				111
				112	def __str__(self):
				113	return ''.join(str(x) for x in self)
				114
				115	def __repr__(self):
				116	return '{}({})'.format(self.__class__.__name__,
				117	super().__repr__())
				118
				119	@property
				120	def value(self):
				121	return ''.join(x.value for x in self if x.value)
				122
				123	@property
				124	def all_defects(self):
				125	return sum((x.all_defects for x in self), self.defects)
				126
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	127	def startswith_fws(self):
				128	return self[0].startswith_fws()
				129
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	130	@property
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	131	def as_ew_allowed(self):
				132	"""True if all top level tokens of this part may be RFC2047 encoded."""
				133	return all(part.as_ew_allowed for part in self)
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	134
				135	@property
				136	def comments(self):
				137	comments = []
				138	for token in self:
				139	comments.extend(token.comments)
				140	return comments
				141
				142	def fold(self, *, policy):
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	143	return _refold_parse_tree(self, policy=policy)
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	144
				145	def pprint(self, indent=''):
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	146	print(self.ppstr(indent=indent))
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	147
				148	def ppstr(self, indent=''):
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	149	return '\n'.join(self._pp(indent=indent))
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	150
				151	def _pp(self, indent=''):
				152	yield '{}{}/{}('.format(
				153	indent,
				154	self.__class__.__name__,
				155	self.token_type)
				156	for token in self:
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	157	if not hasattr(token, '_pp'):
				158	yield (indent + ' !! invalid element in token '
				159	'list: {!r}'.format(token))
				160	else:
Philip Jenvey	4993cc0	2012-10-01 12:53:43 -0700	[diff] [blame]	161	yield from token._pp(indent+' ')
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	162	if self.defects:
				163	extra = ' Defects: {}'.format(self.defects)
				164	else:
				165	extra = ''
				166	yield '{}){}'.format(indent, extra)
				167
				168
				169	class WhiteSpaceTokenList(TokenList):
				170
				171	@property
				172	def value(self):
				173	return ' '
				174
				175	@property
				176	def comments(self):
				177	return [x.content for x in self if x.token_type=='comment']
				178
				179
class UnstructuredTokenList(TokenList):
    """TokenList whose token_type is 'unstructured'."""

    token_type = 'unstructured'
				183
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	184
class Phrase(TokenList):
    """TokenList whose token_type is 'phrase'."""

    token_type = 'phrase'
				188
class Word(TokenList):
    """TokenList whose token_type is 'word'."""

    token_type = 'word'
				192
				193
class CFWSList(WhiteSpaceTokenList):
    """Whitespace token list whose token_type is 'cfws'."""

    token_type = 'cfws'
				197
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	198
class Atom(TokenList):
    """TokenList whose token_type is 'atom'."""

    token_type = 'atom'
				202
				203
class Token(TokenList):
    """TokenList whose token_type is 'token'."""

    token_type = 'token'
    # NOTE(review): nothing in the visible code reads 'encode_as_ew'; the
    # sibling classes spell their encoded-word flag 'as_ew_allowed'.
    # Confirm which name was intended.
    encode_as_ew = False
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	208
				209
class EncodedWord(TokenList):
    """TokenList whose token_type is 'encoded-word'."""

    token_type = 'encoded-word'
    # Class-level defaults; presumably replaced on the instance by the
    # parser once the encoded word has been decoded -- TODO confirm.
    cte = None
    charset = None
    lang = None
				216
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	217
				218	class QuotedString(TokenList):
				219
				220	token_type = 'quoted-string'
				221
				222	@property
				223	def content(self):
				224	for x in self:
				225	if x.token_type == 'bare-quoted-string':
				226	return x.value
				227
				228	@property
				229	def quoted_value(self):
				230	res = []
				231	for x in self:
				232	if x.token_type == 'bare-quoted-string':
				233	res.append(str(x))
				234	else:
				235	res.append(x.value)
				236	return ''.join(res)
				237
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	238	@property
				239	def stripped_value(self):
				240	for token in self:
				241	if token.token_type == 'bare-quoted-string':
				242	return token.value
				243
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	244
				245	class BareQuotedString(QuotedString):
				246
				247	token_type = 'bare-quoted-string'
				248
				249	def __str__(self):
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	250	return quote_string(''.join(str(x) for x in self))
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	251
				252	@property
				253	def value(self):
				254	return ''.join(str(x) for x in self)
				255
				256
				257	class Comment(WhiteSpaceTokenList):
				258
				259	token_type = 'comment'
				260
				261	def __str__(self):
				262	return ''.join(sum([
				263	["("],
				264	[self.quote(x) for x in self],
				265	[")"],
				266	], []))
				267
				268	def quote(self, value):
				269	if value.token_type == 'comment':
				270	return str(value)
				271	return str(value).replace('\\', '\\\\').replace(
R David Murray	44b548d	2016-09-08 13:59:53 -0400	[diff] [blame]	272	'(', r'\(').replace(
				273	')', r'\)')
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	274
				275	@property
				276	def content(self):
				277	return ''.join(str(x) for x in self)
				278
				279	@property
				280	def comments(self):
				281	return [self.content]
				282
				283	class AddressList(TokenList):
				284
				285	token_type = 'address-list'
				286
				287	@property
				288	def addresses(self):
				289	return [x for x in self if x.token_type=='address']
				290
				291	@property
				292	def mailboxes(self):
				293	return sum((x.mailboxes
				294	for x in self if x.token_type=='address'), [])
				295
				296	@property
				297	def all_mailboxes(self):
				298	return sum((x.all_mailboxes
				299	for x in self if x.token_type=='address'), [])
				300
				301
				302	class Address(TokenList):
				303
				304	token_type = 'address'
				305
				306	@property
				307	def display_name(self):
				308	if self[0].token_type == 'group':
				309	return self[0].display_name
				310
				311	@property
				312	def mailboxes(self):
				313	if self[0].token_type == 'mailbox':
				314	return [self[0]]
				315	elif self[0].token_type == 'invalid-mailbox':
				316	return []
				317	return self[0].mailboxes
				318
				319	@property
				320	def all_mailboxes(self):
				321	if self[0].token_type == 'mailbox':
				322	return [self[0]]
				323	elif self[0].token_type == 'invalid-mailbox':
				324	return [self[0]]
				325	return self[0].all_mailboxes
				326
				327	class MailboxList(TokenList):
				328
				329	token_type = 'mailbox-list'
				330
				331	@property
				332	def mailboxes(self):
				333	return [x for x in self if x.token_type=='mailbox']
				334
				335	@property
				336	def all_mailboxes(self):
				337	return [x for x in self
				338	if x.token_type in ('mailbox', 'invalid-mailbox')]
				339
				340
				341	class GroupList(TokenList):
				342
				343	token_type = 'group-list'
				344
				345	@property
				346	def mailboxes(self):
				347	if not self or self[0].token_type != 'mailbox-list':
				348	return []
				349	return self[0].mailboxes
				350
				351	@property
				352	def all_mailboxes(self):
				353	if not self or self[0].token_type != 'mailbox-list':
				354	return []
				355	return self[0].all_mailboxes
				356
				357
				358	class Group(TokenList):
				359
				360	token_type = "group"
				361
				362	@property
				363	def mailboxes(self):
				364	if self[2].token_type != 'group-list':
				365	return []
				366	return self[2].mailboxes
				367
				368	@property
				369	def all_mailboxes(self):
				370	if self[2].token_type != 'group-list':
				371	return []
				372	return self[2].all_mailboxes
				373
				374	@property
				375	def display_name(self):
				376	return self[0].display_name
				377
				378
				379	class NameAddr(TokenList):
				380
				381	token_type = 'name-addr'
				382
				383	@property
				384	def display_name(self):
				385	if len(self) == 1:
				386	return None
				387	return self[0].display_name
				388
				389	@property
				390	def local_part(self):
				391	return self[-1].local_part
				392
				393	@property
				394	def domain(self):
				395	return self[-1].domain
				396
				397	@property
				398	def route(self):
				399	return self[-1].route
				400
				401	@property
				402	def addr_spec(self):
				403	return self[-1].addr_spec
				404
				405
				406	class AngleAddr(TokenList):
				407
				408	token_type = 'angle-addr'
				409
				410	@property
				411	def local_part(self):
				412	for x in self:
				413	if x.token_type == 'addr-spec':
				414	return x.local_part
				415
				416	@property
				417	def domain(self):
				418	for x in self:
				419	if x.token_type == 'addr-spec':
				420	return x.domain
				421
				422	@property
				423	def route(self):
				424	for x in self:
				425	if x.token_type == 'obs-route':
				426	return x.domains
				427
				428	@property
				429	def addr_spec(self):
				430	for x in self:
				431	if x.token_type == 'addr-spec':
jayyyin	aa218d1	2018-01-29 13:07:44 -0500	[diff] [blame]	432	if x.local_part:
				433	return x.addr_spec
				434	else:
				435	return quote_string(x.local_part) + x.addr_spec
R David Murray	032eed3	2012-05-26 14:31:12 -0400	[diff] [blame]	436	else:
				437	return '<>'
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	438
				439
				440	class ObsRoute(TokenList):
				441
				442	token_type = 'obs-route'
				443
				444	@property
				445	def domains(self):
				446	return [x.domain for x in self if x.token_type == 'domain']
				447
				448
				449	class Mailbox(TokenList):
				450
				451	token_type = 'mailbox'
				452
				453	@property
				454	def display_name(self):
				455	if self[0].token_type == 'name-addr':
				456	return self[0].display_name
				457
				458	@property
				459	def local_part(self):
				460	return self[0].local_part
				461
				462	@property
				463	def domain(self):
				464	return self[0].domain
				465
				466	@property
				467	def route(self):
				468	if self[0].token_type == 'name-addr':
				469	return self[0].route
				470
				471	@property
				472	def addr_spec(self):
				473	return self[0].addr_spec
				474
				475
				476	class InvalidMailbox(TokenList):
				477
				478	token_type = 'invalid-mailbox'
				479
				480	@property
				481	def display_name(self):
				482	return None
				483
				484	local_part = domain = route = addr_spec = display_name
				485
				486
				487	class Domain(TokenList):
				488
				489	token_type = 'domain'
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	490	as_ew_allowed = False
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	491
				492	@property
				493	def domain(self):
				494	return ''.join(super().value.split())
				495
				496
class DotAtom(TokenList):
    """TokenList whose token_type is 'dot-atom'."""

    token_type = 'dot-atom'
				500
				501
class DotAtomText(TokenList):
    """TokenList whose token_type is 'dot-atom-text'."""

    token_type = 'dot-atom-text'
    # Plain class attribute; it replaces the as_ew_allowed property that
    # TokenList computes from the children.
    as_ew_allowed = True
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	506
				507
				508	class AddrSpec(TokenList):
				509
				510	token_type = 'addr-spec'
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	511	as_ew_allowed = False
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	512
				513	@property
				514	def local_part(self):
				515	return self[0].local_part
				516
				517	@property
				518	def domain(self):
				519	if len(self) < 3:
				520	return None
				521	return self[-1].domain
				522
				523	@property
				524	def value(self):
				525	if len(self) < 3:
				526	return self[0].value
				527	return self[0].value.rstrip()+self[1].value+self[2].value.lstrip()
				528
				529	@property
				530	def addr_spec(self):
				531	nameset = set(self.local_part)
				532	if len(nameset) > len(nameset-DOT_ATOM_ENDS):
				533	lp = quote_string(self.local_part)
				534	else:
				535	lp = self.local_part
				536	if self.domain is not None:
				537	return lp + '@' + self.domain
				538	return lp
				539
				540
class ObsLocalPart(TokenList):
    """TokenList whose token_type is 'obs-local-part'."""

    token_type = 'obs-local-part'
    # Plain class attribute; it replaces the as_ew_allowed property that
    # TokenList computes from the children.
    as_ew_allowed = False
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	545
				546
				547	class DisplayName(Phrase):
				548
				549	token_type = 'display-name'
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	550	ew_combine_allowed = False
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	551
				552	@property
				553	def display_name(self):
				554	res = TokenList(self)
				555	if res[0].token_type == 'cfws':
				556	res.pop(0)
				557	else:
				558	if res[0][0].token_type == 'cfws':
				559	res[0] = TokenList(res[0][1:])
				560	if res[-1].token_type == 'cfws':
				561	res.pop()
				562	else:
				563	if res[-1][-1].token_type == 'cfws':
				564	res[-1] = TokenList(res[-1][:-1])
				565	return res.value
				566
				567	@property
				568	def value(self):
				569	quote = False
				570	if self.defects:
				571	quote = True
				572	else:
				573	for x in self:
				574	if x.token_type == 'quoted-string':
				575	quote = True
				576	if quote:
				577	pre = post = ''
				578	if self[0].token_type=='cfws' or self[0][0].token_type=='cfws':
				579	pre = ' '
				580	if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws':
				581	post = ' '
				582	return pre+quote_string(self.display_name)+post
				583	else:
				584	return super().value
				585
				586
class LocalPart(TokenList):
    """TokenList whose token_type is 'local-part'.

    Both properties look only at the first child.
    """

    token_type = 'local-part'
    as_ew_allowed = False

    @property
    def value(self):
        """The first child's quoted_value if it is a quoted-string,
        otherwise its value."""
        if self[0].token_type == "quoted-string":
            return self[0].quoted_value
        else:
            return self[0].value

    @property
    def local_part(self):
        """Return the value of the first child's tokens with CFWS removed
        from the ends and from around the dots."""
        # Strip whitespace from front, back, and around dots.
        # A DOT sentinel is placed at both ends so that leading and
        # trailing whitespace is handled by the same "next to a dot" rules;
        # the sentinels are sliced off again before computing the value.
        res = [DOT]
        last = DOT
        last_is_tl = False
        for tok in self[0] + [DOT]:
            if tok.token_type == 'cfws':
                continue
            # A dot follows a token list that ends in CFWS: replace that
            # list with a copy lacking its trailing CFWS.
            if (last_is_tl and tok.token_type == 'dot' and
                    last[-1].token_type == 'cfws'):
                res[-1] = TokenList(last[:-1])
            is_tl = isinstance(tok, TokenList)
            # A token list that follows a dot and starts with CFWS is
            # appended without that leading CFWS.
            if (is_tl and last.token_type == 'dot' and
                    tok[0].token_type == 'cfws'):
                res.append(TokenList(tok[1:]))
            else:
                res.append(tok)
            last = res[-1]
            last_is_tl = is_tl
        res = TokenList(res[1:-1])
        return res.value
				621
				622
				623	class DomainLiteral(TokenList):
				624
				625	token_type = 'domain-literal'
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	626	as_ew_allowed = False
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	627
				628	@property
				629	def domain(self):
				630	return ''.join(super().value.split())
				631
				632	@property
				633	def ip(self):
				634	for x in self:
				635	if x.token_type == 'ptext':
				636	return x.value
				637
				638
class MIMEVersion(TokenList):
    """TokenList whose token_type is 'mime-version'."""

    token_type = 'mime-version'
    # Class-level defaults; presumably set on the instance by the parser
    # when a version number is found -- TODO confirm.
    major = None
    minor = None
				644
				645
				646	class Parameter(TokenList):
				647
				648	token_type = 'parameter'
				649	sectioned = False
				650	extended = False
				651	charset = 'us-ascii'
				652
				653	@property
				654	def section_number(self):
				655	# Because the first token, the attribute (name) eats CFWS, the second
				656	# token is always the section if there is one.
				657	return self[1].number if self.sectioned else 0
				658
				659	@property
				660	def param_value(self):
				661	# This is part of the "handle quoted extended parameters" hack.
				662	for token in self:
				663	if token.token_type == 'value':
				664	return token.stripped_value
				665	if token.token_type == 'quoted-string':
				666	for token in token:
				667	if token.token_type == 'bare-quoted-string':
				668	for token in token:
				669	if token.token_type == 'value':
				670	return token.stripped_value
				671	return ''
				672
				673
class InvalidParameter(Parameter):
    """Parameter whose token_type is 'invalid-parameter'."""

    token_type = 'invalid-parameter'
				677
				678
				679	class Attribute(TokenList):
				680
				681	token_type = 'attribute'
				682
				683	@property
				684	def stripped_value(self):
				685	for token in self:
				686	if token.token_type.endswith('attrtext'):
				687	return token.value
				688
class Section(TokenList):
    """TokenList whose token_type is 'section'."""

    token_type = 'section'
    # Read by Parameter.section_number; None until something sets it.
    number = None
				693
				694
				695	class Value(TokenList):
				696
				697	token_type = 'value'
				698
				699	@property
				700	def stripped_value(self):
				701	token = self[0]
				702	if token.token_type == 'cfws':
				703	token = self[1]
				704	if token.token_type.endswith(
				705	('quoted-string', 'attribute', 'extended-attribute')):
				706	return token.stripped_value
				707	return self.value
				708
				709
class MimeParameters(TokenList):
    """TokenList whose token_type is 'mime-parameters'.

    ``params`` reassembles the parameter tokens it contains into
    (name, value) pairs and ``__str__`` renders those pairs.
    """

    token_type = 'mime-parameters'
    syntactic_break = False

    @property
    def params(self):
        """Generate (name, value) pairs for the parameters in this list.

        Tokens are grouped by attribute name, each group is ordered by
        section number, and the pieces are joined into a single string.
        Extended pieces are percent-decoded and then decoded using the
        charset of the group's first piece.  Problems found on the way
        are recorded as defects on the parameter tokens concerned.
        """
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer. So we have
        # to assume the RFC 2231 pieces can come in any order. However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = {} # Using order preserving dict from Python 3.7+
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps. This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                    parts = parts[:1]
                # Else assume the 0 was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            # i is the section number expected next.
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen. But we're not doing that. The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    # NOTE(review): this file only does "import urllib";
                    # urllib.parse is presumably already loaded by another
                    # import (e.g. email.utils) -- confirm.
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes. What we do now
                        # is a bit of an open question. I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                        if utils._has_surrogates(value):
                            param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        """Render the parameters as ' name="value"; name2="value2"'.

        Parameters with an empty value are rendered as the bare name.
        The result is '' when there are no parameters, otherwise it
        starts with a single space.
        """
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''
				794
				795
				796	class ParameterizedHeaderValue(TokenList):
				797
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	798	# Set this false so that the value doesn't wind up on a new line even
				799	# if it and the parameters would fit there but not on the first line.
				800	syntactic_break = False
				801
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	802	@property
				803	def params(self):
				804	for token in reversed(self):
				805	if token.token_type == 'mime-parameters':
				806	return token.params
				807	return {}
				808
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	809
class ContentType(ParameterizedHeaderValue):
    """Parameterized header value whose token_type is 'content-type'."""

    token_type = 'content-type'
    as_ew_allowed = False
    # Class-level defaults, in effect until an instance attribute
    # replaces them.
    maintype = 'text'
    subtype = 'plain'
				816
				817
class ContentDisposition(ParameterizedHeaderValue):
    """Parameterized header value whose token_type is
    'content-disposition'."""

    token_type = 'content-disposition'
    as_ew_allowed = False
    # Class-level default, in effect until an instance attribute
    # replaces it.
    content_disposition = None
				823
				824
class ContentTransferEncoding(TokenList):
    """TokenList whose token_type is 'content-transfer-encoding'."""

    token_type = 'content-transfer-encoding'
    as_ew_allowed = False
    # Class-level default, in effect until an instance attribute
    # replaces it.
    cte = '7bit'
				830
				831
class HeaderLabel(TokenList):
    """TokenList whose token_type is 'header-label'."""

    token_type = 'header-label'
    # Plain class attribute; it replaces the as_ew_allowed property that
    # TokenList computes from the children.
    as_ew_allowed = False
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	836
				837
class Header(TokenList):
    """TokenList whose token_type is 'header'."""

    token_type = 'header'
				841
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	842
				843	#
				844	# Terminal classes and instances
				845	#
				846
				847	class Terminal(str):
				848
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	849	as_ew_allowed = True
				850	ew_combine_allowed = True
				851	syntactic_break = True
				852
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	853	def __new__(cls, value, token_type):
				854	self = super().__new__(cls, value)
				855	self.token_type = token_type
				856	self.defects = []
				857	return self
				858
				859	def __repr__(self):
				860	return "{}({})".format(self.__class__.__name__, super().__repr__())
				861
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	862	def pprint(self):
				863	print(self.__class__.__name__ + '/' + self.token_type)
				864
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	865	@property
				866	def all_defects(self):
				867	return list(self.defects)
				868
				869	def _pp(self, indent=''):
				870	return ["{}{}/{}({}){}".format(
				871	indent,
				872	self.__class__.__name__,
				873	self.token_type,
				874	super().__repr__(),
				875	'' if not self.defects else ' {}'.format(self.defects),
				876	)]
				877
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	878	def pop_trailing_ws(self):
				879	# This terminates the recursion.
				880	return None
				881
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	882	@property
				883	def comments(self):
				884	return []
				885
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	886	def __getnewargs__(self):
				887	return(str(self), self.token_type)
				888
				889
class WhiteSpaceTerminal(Terminal):
    """Terminal that counts as whitespace."""

    @property
    def value(self):
        # The value is one space, whatever the terminal's own text is.
        return ' '

    def startswith_fws(self):
        """Always True for a whitespace terminal."""
        return True
				898
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	899
class ValueTerminal(Terminal):
    """Terminal whose semantic value is the terminal itself (its own text)."""

    @property
    def value(self):
        return self

    def startswith_fws(self):
        return False
				908
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	909
class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    """Whitespace terminal that contributes nothing to the value or to str().

    get_unstructured() substitutes this for the whitespace token that sits
    between two encoded words.
    """

    @property
    def value(self):
        return ''

    def __str__(self):
        return ''
				918
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	919
# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
# Module-level ValueTerminal instances that the parser functions append
# directly (the same object each time) to the token lists they build.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
				926
				927	#
				928	# Parser
				929	#
				930
Victor Stinner	765531d	2013-03-26 01:11:54 +0100	[diff] [blame]	931	# Parse strings according to RFC822/2047/2822/5322 rules.
				932	#
				933	# This is a stateless parser. Each get_XXX function accepts a string and
				934	# returns either a Terminal or a TokenList representing the RFC object named
				935	# by the method and a string containing the remaining unparsed characters
				936	# from the input. Thus a parser method consumes the next syntactic construct
				937	# of a given type and returns a token representing the construct plus the
				938	# unparsed remainder of the input string.
				939	#
				940	# For example, if the first element of a structured header is a 'phrase',
				941	# then:
				942	#
				943	# phrase, value = get_phrase(value)
				944	#
				945	# returns the complete phrase from the start of the string value, plus any
				946	# characters left in the string after the phrase is removed.
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	947
# Bound methods of pre-compiled patterns used by the parser functions.

# Split on runs of WSP characters; the capturing group keeps each run in
# the result list.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
# Match a leading run of characters that are not in ATOM_ENDS.
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATOM_ENDS)))).match
# Find every character in the ranges \x00-\x20 and \x7F.
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
# Match a leading run of characters that are not in TOKEN_ENDS.
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(TOKEN_ENDS)))).match
# Match a leading run of characters that are not in ATTRIBUTE_ENDS.
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(ATTRIBUTE_ENDS)))).match
# Match a leading run of characters that are not in EXTENDED_ATTRIBUTE_ENDS.
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    re.escape(''.join(EXTENDED_ATTRIBUTE_ENDS)))).match
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	958
				959	def _validate_xtext(xtext):
				960	"""If input token contains ASCII non-printables, register a defect."""
				961
				962	non_printables = _non_printable_finder(xtext)
				963	if non_printables:
				964	xtext.defects.append(errors.NonPrintableDefect(non_printables))
				965	if utils._has_surrogates(xtext):
				966	xtext.defects.append(errors.UndecodableBytesDefect(
				967	"Non-ASCII characters found in header token"))
				968
				969	def _get_ptext_to_endchars(value, endchars):
				970	"""Scan printables/quoted-pairs until endchars and return unquoted ptext.
				971
				972	This function turns a run of qcontent, ccontent-without-comments, or
				973	dtext-with-quoted-printables into a single string by unquoting any
				974	quoted printables. It returns the string, the remaining value, and
				975	a flag that is True iff there were any quoted printables decoded.
				976
				977	"""
				978	fragment, *remainder = _wsp_splitter(value, 1)
				979	vchars = []
				980	escape = False
				981	had_qp = False
				982	for pos in range(len(fragment)):
				983	if fragment[pos] == '\\':
				984	if escape:
				985	escape = False
				986	had_qp = True
				987	else:
				988	escape = True
				989	continue
				990	if escape:
				991	escape = False
				992	elif fragment[pos] in endchars:
				993	break
				994	vchars.append(fragment[pos])
				995	else:
				996	pos = pos + 1
				997	return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
				998
def get_fws(value):
    """FWS = 1*WSP

    This is not the RFC definition of FWS.  Here 'fws' marks places where
    folding may happen; the input has already been unfolded, so CRLF does
    not need to be considered.

    Returns the leading whitespace of value as a WhiteSpaceTerminal of type
    'fws', together with the rest of value.
    """
    stripped = value.lstrip()
    ws_length = len(value) - len(stripped)
    return WhiteSpaceTerminal(value[:ws_length], 'fws'), stripped
				1010
				1011	def get_encoded_word(value):
				1012	""" encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
				1013
				1014	"""
				1015	ew = EncodedWord()
				1016	if not value.startswith('=?'):
				1017	raise errors.HeaderParseError(
				1018	"expected encoded word but found {}".format(value))
				1019	tok, *remainder = value[2:].split('?=', 1)
				1020	if tok == value[2:]:
				1021	raise errors.HeaderParseError(
				1022	"expected encoded word but found {}".format(value))
				1023	remstr = ''.join(remainder)
R David Murray	65171b2	2013-07-11 15:52:57 -0400	[diff] [blame]	1024	if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
				1025	# The ? after the CTE was followed by an encoded word escape (=XX).
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1026	rest, *remainder = remstr.split('?=', 1)
				1027	tok = tok + '?=' + rest
				1028	if len(tok.split()) > 1:
				1029	ew.defects.append(errors.InvalidHeaderDefect(
				1030	"whitespace inside encoded word"))
				1031	ew.cte = value
				1032	value = ''.join(remainder)
				1033	try:
				1034	text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
				1035	except ValueError:
				1036	raise errors.HeaderParseError(
				1037	"encoded word format invalid: '{}'".format(ew.cte))
				1038	ew.charset = charset
				1039	ew.lang = lang
				1040	ew.defects.extend(defects)
				1041	while text:
				1042	if text[0] in WSP:
				1043	token, text = get_fws(text)
				1044	ew.append(token)
				1045	continue
				1046	chars, *remainder = _wsp_splitter(text, 1)
				1047	vtext = ValueTerminal(chars, 'vtext')
				1048	_validate_xtext(vtext)
				1049	ew.append(vtext)
				1050	text = ''.join(remainder)
				1051	return ew, value
				1052
				1053	def get_unstructured(value):
				1054	"""unstructured = (([FWS] vchar) WSP) / obs-unstruct
				1055	obs-unstruct = ((LF CR (obs-utext) LF CR)) / FWS)
				1056	obs-utext = %d0 / obs-NO-WS-CTL / LF / CR
				1057
				1058	obs-NO-WS-CTL is control characters except WSP/CR/LF.
				1059
				1060	So, basically, we have printable runs, plus control characters or nulls in
				1061	the obsolete syntax, separated by whitespace. Since RFC 2047 uses the
				1062	obsolete syntax in its specification, but requires whitespace on either
				1063	side of the encoded words, I can see no reason to need to separate the
				1064	non-printable-non-whitespace from the printable runs if they occur, so we
				1065	parse this into xtext tokens separated by WSP tokens.
				1066
				1067	Because an 'unstructured' value must by definition constitute the entire
				1068	value, this 'get' routine does not return a remaining value, only the
				1069	parsed TokenList.
				1070
				1071	"""
				1072	# XXX: but what about bare CR and LF? They might signal the start or
R David Murray	65171b2	2013-07-11 15:52:57 -0400	[diff] [blame]	1073	# end of an encoded word. YAGNI for now, since our current parsers
				1074	# will never send us strings with bare CR or LF.
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1075
				1076	unstructured = UnstructuredTokenList()
				1077	while value:
				1078	if value[0] in WSP:
				1079	token, value = get_fws(value)
				1080	unstructured.append(token)
				1081	continue
				1082	if value.startswith('=?'):
				1083	try:
				1084	token, value = get_encoded_word(value)
				1085	except errors.HeaderParseError:
R David Murray	65171b2	2013-07-11 15:52:57 -0400	[diff] [blame]	1086	# XXX: Need to figure out how to register defects when
				1087	# appropriate here.
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1088	pass
				1089	else:
				1090	have_ws = True
				1091	if len(unstructured) > 0:
				1092	if unstructured[-1].token_type != 'fws':
				1093	unstructured.defects.append(errors.InvalidHeaderDefect(
				1094	"missing whitespace before encoded word"))
				1095	have_ws = False
				1096	if have_ws and len(unstructured) > 1:
				1097	if unstructured[-2].token_type == 'encoded-word':
				1098	unstructured[-1] = EWWhiteSpaceTerminal(
				1099	unstructured[-1], 'fws')
				1100	unstructured.append(token)
				1101	continue
				1102	tok, *remainder = _wsp_splitter(value, 1)
				1103	vtext = ValueTerminal(tok, 'vtext')
				1104	_validate_xtext(vtext)
				1105	unstructured.append(vtext)
				1106	value = ''.join(remainder)
				1107	return unstructured
				1108
				1109	def get_qp_ctext(value):
R David Murray	44b548d	2016-09-08 13:59:53 -0400	[diff] [blame]	1110	r"""ctext = <printable ascii except \ ( )>
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1111
				1112	This is not the RFC ctext, since we are handling nested comments in comment
				1113	and unquoting quoted-pairs here. We allow anything except the '()'
				1114	characters, but if we find any ASCII other than the RFC defined printable
Serhiy Storchaka	6a7b3a7	2016-04-17 08:32:47 +0300	[diff] [blame]	1115	ASCII, a NonPrintableDefect is added to the token's defects list. Since
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1116	quoted pairs are converted to their unquoted values, what is returned is
				1117	a 'ptext' token. In this case it is a WhiteSpaceTerminal, so it's value
				1118	is ' '.
				1119
				1120	"""
				1121	ptext, value, _ = _get_ptext_to_endchars(value, '()')
				1122	ptext = WhiteSpaceTerminal(ptext, 'ptext')
				1123	_validate_xtext(ptext)
				1124	return ptext, value
				1125
				1126	def get_qcontent(value):
				1127	"""qcontent = qtext / quoted-pair
				1128
				1129	We allow anything except the DQUOTE character, but if we find any ASCII
Serhiy Storchaka	6a7b3a7	2016-04-17 08:32:47 +0300	[diff] [blame]	1130	other than the RFC defined printable ASCII, a NonPrintableDefect is
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1131	added to the token's defects list. Any quoted pairs are converted to their
				1132	unquoted values, so what is returned is a 'ptext' token. In this case it
				1133	is a ValueTerminal.
				1134
				1135	"""
				1136	ptext, value, _ = _get_ptext_to_endchars(value, '"')
				1137	ptext = ValueTerminal(ptext, 'ptext')
				1138	_validate_xtext(ptext)
				1139	return ptext, value
				1140
				1141	def get_atext(value):
				1142	"""atext = <matches _atext_matcher>
				1143
				1144	We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
				1145	the token's defects list if we find non-atext characters.
				1146	"""
				1147	m = _non_atom_end_matcher(value)
				1148	if not m:
				1149	raise errors.HeaderParseError(
				1150	"expected atext but found '{}'".format(value))
				1151	atext = m.group()
				1152	value = value[len(atext):]
				1153	atext = ValueTerminal(atext, 'atext')
				1154	_validate_xtext(atext)
				1155	return atext, value
				1156
				1157	def get_bare_quoted_string(value):
				1158	"""bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE
				1159
				1160	A quoted-string without the leading or trailing white space. Its
				1161	value is the text between the quote marks, with whitespace
				1162	preserved and quoted pairs decoded.
				1163	"""
				1164	if value[0] != '"':
				1165	raise errors.HeaderParseError(
				1166	"expected '\"' but found '{}'".format(value))
				1167	bare_quoted_string = BareQuotedString()
				1168	value = value[1:]
jayyyin	aa218d1	2018-01-29 13:07:44 -0500	[diff] [blame]	1169	if value[0] == '"':
				1170	token, value = get_qcontent(value)
				1171	bare_quoted_string.append(token)
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1172	while value and value[0] != '"':
				1173	if value[0] in WSP:
				1174	token, value = get_fws(value)
R David Murray	0400d33	2014-02-08 13:12:00 -0500	[diff] [blame]	1175	elif value[:2] == '=?':
				1176	try:
				1177	token, value = get_encoded_word(value)
				1178	bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
				1179	"encoded word inside quoted string"))
				1180	except errors.HeaderParseError:
				1181	token, value = get_qcontent(value)
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1182	else:
				1183	token, value = get_qcontent(value)
				1184	bare_quoted_string.append(token)
				1185	if not value:
				1186	bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
				1187	"end of header inside quoted string"))
				1188	return bare_quoted_string, value
				1189	return bare_quoted_string, value[1:]
				1190
				1191	def get_comment(value):
				1192	"""comment = "(" *([FWS] ccontent) [FWS] ")"
				1193	ccontent = ctext / quoted-pair / comment
				1194
				1195	We handle nested comments here, and quoted-pair in our qp-ctext routine.
				1196	"""
				1197	if value and value[0] != '(':
				1198	raise errors.HeaderParseError(
				1199	"expected '(' but found '{}'".format(value))
				1200	comment = Comment()
				1201	value = value[1:]
				1202	while value and value[0] != ")":
				1203	if value[0] in WSP:
				1204	token, value = get_fws(value)
				1205	elif value[0] == '(':
				1206	token, value = get_comment(value)
				1207	else:
				1208	token, value = get_qp_ctext(value)
				1209	comment.append(token)
				1210	if not value:
				1211	comment.defects.append(errors.InvalidHeaderDefect(
				1212	"end of header inside comment"))
				1213	return comment, value
				1214	return comment, value[1:]
				1215
				1216	def get_cfws(value):
				1217	"""CFWS = (1*([FWS] comment) [FWS]) / FWS
				1218
				1219	"""
				1220	cfws = CFWSList()
				1221	while value and value[0] in CFWS_LEADER:
				1222	if value[0] in WSP:
				1223	token, value = get_fws(value)
				1224	else:
				1225	token, value = get_comment(value)
				1226	cfws.append(token)
				1227	return cfws, value
				1228
				1229	def get_quoted_string(value):
				1230	"""quoted-string = [CFWS] <bare-quoted-string> [CFWS]
				1231
				1232	'bare-quoted-string' is an intermediate class defined by this
				1233	parser and not by the RFC grammar. It is the quoted string
				1234	without any attached CFWS.
				1235	"""
				1236	quoted_string = QuotedString()
				1237	if value and value[0] in CFWS_LEADER:
				1238	token, value = get_cfws(value)
				1239	quoted_string.append(token)
				1240	token, value = get_bare_quoted_string(value)
				1241	quoted_string.append(token)
				1242	if value and value[0] in CFWS_LEADER:
				1243	token, value = get_cfws(value)
				1244	quoted_string.append(token)
				1245	return quoted_string, value
				1246
				1247	def get_atom(value):
				1248	"""atom = [CFWS] 1*atext [CFWS]
				1249
R David Murray	923512f	2013-07-12 16:00:28 -0400	[diff] [blame]	1250	An atom could be an rfc2047 encoded word.
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1251	"""
				1252	atom = Atom()
				1253	if value and value[0] in CFWS_LEADER:
				1254	token, value = get_cfws(value)
				1255	atom.append(token)
				1256	if value and value[0] in ATOM_ENDS:
				1257	raise errors.HeaderParseError(
				1258	"expected atom but found '{}'".format(value))
R David Murray	923512f	2013-07-12 16:00:28 -0400	[diff] [blame]	1259	if value.startswith('=?'):
				1260	try:
				1261	token, value = get_encoded_word(value)
				1262	except errors.HeaderParseError:
				1263	# XXX: need to figure out how to register defects when
				1264	# appropriate here.
				1265	token, value = get_atext(value)
				1266	else:
				1267	token, value = get_atext(value)
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1268	atom.append(token)
				1269	if value and value[0] in CFWS_LEADER:
				1270	token, value = get_cfws(value)
				1271	atom.append(token)
				1272	return atom, value
				1273
				1274	def get_dot_atom_text(value):
				1275	""" dot-text = 1atext ("." 1*atext)
				1276
				1277	"""
				1278	dot_atom_text = DotAtomText()
				1279	if not value or value[0] in ATOM_ENDS:
				1280	raise errors.HeaderParseError("expected atom at a start of "
				1281	"dot-atom-text but found '{}'".format(value))
				1282	while value and value[0] not in ATOM_ENDS:
				1283	token, value = get_atext(value)
				1284	dot_atom_text.append(token)
				1285	if value and value[0] == '.':
				1286	dot_atom_text.append(DOT)
				1287	value = value[1:]
				1288	if dot_atom_text[-1] is DOT:
				1289	raise errors.HeaderParseError("expected atom at end of dot-atom-text "
				1290	"but found '{}'".format('.'+value))
				1291	return dot_atom_text, value
				1292
				1293	def get_dot_atom(value):
				1294	""" dot-atom = [CFWS] dot-atom-text [CFWS]
				1295
R David Murray	923512f	2013-07-12 16:00:28 -0400	[diff] [blame]	1296	Any place we can have a dot atom, we could instead have an rfc2047 encoded
				1297	word.
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1298	"""
				1299	dot_atom = DotAtom()
				1300	if value[0] in CFWS_LEADER:
				1301	token, value = get_cfws(value)
				1302	dot_atom.append(token)
R David Murray	923512f	2013-07-12 16:00:28 -0400	[diff] [blame]	1303	if value.startswith('=?'):
				1304	try:
				1305	token, value = get_encoded_word(value)
				1306	except errors.HeaderParseError:
				1307	# XXX: need to figure out how to register defects when
				1308	# appropriate here.
				1309	token, value = get_dot_atom_text(value)
				1310	else:
				1311	token, value = get_dot_atom_text(value)
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1312	dot_atom.append(token)
				1313	if value and value[0] in CFWS_LEADER:
				1314	token, value = get_cfws(value)
				1315	dot_atom.append(token)
				1316	return dot_atom, value
				1317
				1318	def get_word(value):
				1319	"""word = atom / quoted-string
				1320
				1321	Either atom or quoted-string may start with CFWS. We have to peel off this
				1322	CFWS first to determine which type of word to parse. Afterward we splice
				1323	the leading CFWS, if any, into the parsed sub-token.
				1324
				1325	If neither an atom or a quoted-string is found before the next special, a
				1326	HeaderParseError is raised.
				1327
				1328	The token returned is either an Atom or a QuotedString, as appropriate.
				1329	This means the 'word' level of the formal grammar is not represented in the
				1330	parse tree; this is because having that extra layer when manipulating the
				1331	parse tree is more confusing than it is helpful.
				1332
				1333	"""
				1334	if value[0] in CFWS_LEADER:
				1335	leader, value = get_cfws(value)
				1336	else:
				1337	leader = None
				1338	if value[0]=='"':
				1339	token, value = get_quoted_string(value)
				1340	elif value[0] in SPECIALS:
				1341	raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
				1342	"but found '{}'".format(value))
				1343	else:
				1344	token, value = get_atom(value)
				1345	if leader is not None:
				1346	token[:0] = [leader]
				1347	return token, value
				1348
				1349	def get_phrase(value):
				1350	""" phrase = 1*word / obs-phrase
				1351	obs-phrase = word *(word / "." / CFWS)
				1352
				1353	This means a phrase can be a sequence of words, periods, and CFWS in any
				1354	order as long as it starts with at least one word. If anything other than
				1355	words is detected, an ObsoleteHeaderDefect is added to the token's defect
				1356	list. We also accept a phrase that starts with CFWS followed by a dot;
				1357	this is registered as an InvalidHeaderDefect, since it is not supported by
				1358	even the obsolete grammar.
				1359
				1360	"""
				1361	phrase = Phrase()
				1362	try:
				1363	token, value = get_word(value)
				1364	phrase.append(token)
				1365	except errors.HeaderParseError:
				1366	phrase.defects.append(errors.InvalidHeaderDefect(
				1367	"phrase does not start with word"))
				1368	while value and value[0] not in PHRASE_ENDS:
				1369	if value[0]=='.':
				1370	phrase.append(DOT)
				1371	phrase.defects.append(errors.ObsoleteHeaderDefect(
				1372	"period in 'phrase'"))
				1373	value = value[1:]
				1374	else:
				1375	try:
				1376	token, value = get_word(value)
				1377	except errors.HeaderParseError:
				1378	if value[0] in CFWS_LEADER:
				1379	token, value = get_cfws(value)
				1380	phrase.defects.append(errors.ObsoleteHeaderDefect(
				1381	"comment found without atom"))
				1382	else:
				1383	raise
				1384	phrase.append(token)
				1385	return phrase, value
				1386
				1387	def get_local_part(value):
				1388	""" local-part = dot-atom / quoted-string / obs-local-part
				1389
				1390	"""
				1391	local_part = LocalPart()
				1392	leader = None
				1393	if value[0] in CFWS_LEADER:
				1394	leader, value = get_cfws(value)
				1395	if not value:
				1396	raise errors.HeaderParseError(
				1397	"expected local-part but found '{}'".format(value))
				1398	try:
				1399	token, value = get_dot_atom(value)
				1400	except errors.HeaderParseError:
				1401	try:
				1402	token, value = get_word(value)
				1403	except errors.HeaderParseError:
				1404	if value[0] != '\\' and value[0] in PHRASE_ENDS:
				1405	raise
				1406	token = TokenList()
				1407	if leader is not None:
				1408	token[:0] = [leader]
				1409	local_part.append(token)
				1410	if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
				1411	obs_local_part, value = get_obs_local_part(str(local_part) + value)
				1412	if obs_local_part.token_type == 'invalid-obs-local-part':
				1413	local_part.defects.append(errors.InvalidHeaderDefect(
				1414	"local-part is not dot-atom, quoted-string, or obs-local-part"))
				1415	else:
				1416	local_part.defects.append(errors.ObsoleteHeaderDefect(
				1417	"local-part is not a dot-atom (contains CFWS)"))
				1418	local_part[0] = obs_local_part
				1419	try:
				1420	local_part.value.encode('ascii')
				1421	except UnicodeEncodeError:
				1422	local_part.defects.append(errors.NonASCIILocalPartDefect(
				1423	"local-part contains non-ASCII characters)"))
				1424	return local_part, value
				1425
				1426	def get_obs_local_part(value):
				1427	""" obs-local-part = word *("." word)
				1428	"""
				1429	obs_local_part = ObsLocalPart()
				1430	last_non_ws_was_dot = False
				1431	while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
				1432	if value[0] == '.':
				1433	if last_non_ws_was_dot:
				1434	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1435	"invalid repeated '.'"))
				1436	obs_local_part.append(DOT)
				1437	last_non_ws_was_dot = True
				1438	value = value[1:]
				1439	continue
				1440	elif value[0]=='\\':
				1441	obs_local_part.append(ValueTerminal(value[0],
				1442	'misplaced-special'))
				1443	value = value[1:]
				1444	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1445	"'\\' character outside of quoted-string/ccontent"))
				1446	last_non_ws_was_dot = False
				1447	continue
				1448	if obs_local_part and obs_local_part[-1].token_type != 'dot':
				1449	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1450	"missing '.' between words"))
				1451	try:
				1452	token, value = get_word(value)
				1453	last_non_ws_was_dot = False
				1454	except errors.HeaderParseError:
				1455	if value[0] not in CFWS_LEADER:
				1456	raise
				1457	token, value = get_cfws(value)
				1458	obs_local_part.append(token)
				1459	if (obs_local_part[0].token_type == 'dot' or
				1460	obs_local_part[0].token_type=='cfws' and
				1461	obs_local_part[1].token_type=='dot'):
				1462	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1463	"Invalid leading '.' in local part"))
				1464	if (obs_local_part[-1].token_type == 'dot' or
				1465	obs_local_part[-1].token_type=='cfws' and
				1466	obs_local_part[-2].token_type=='dot'):
				1467	obs_local_part.defects.append(errors.InvalidHeaderDefect(
				1468	"Invalid trailing '.' in local part"))
				1469	if obs_local_part.defects:
				1470	obs_local_part.token_type = 'invalid-obs-local-part'
				1471	return obs_local_part, value
				1472
				1473	def get_dtext(value):
R David Murray	44b548d	2016-09-08 13:59:53 -0400	[diff] [blame]	1474	r""" dtext = <printable ascii except \ [ ]> / obs-dtext
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1475	obs-dtext = obs-NO-WS-CTL / quoted-pair
				1476
Terry Jan Reedy	0f84764	2013-03-11 18:34:00 -0400	[diff] [blame]	1477	We allow anything except the excluded characters, but if we find any
Serhiy Storchaka	6a7b3a7	2016-04-17 08:32:47 +0300	[diff] [blame]	1478	ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1479	added to the token's defects list. Quoted pairs are converted to their
				1480	unquoted values, so what is returned is a ptext token, in this case a
				1481	ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is
				1482	added to the returned token's defect list.
				1483
				1484	"""
				1485	ptext, value, had_qp = _get_ptext_to_endchars(value, '[]')
				1486	ptext = ValueTerminal(ptext, 'ptext')
				1487	if had_qp:
				1488	ptext.defects.append(errors.ObsoleteHeaderDefect(
				1489	"quoted printable found in domain-literal"))
				1490	_validate_xtext(ptext)
				1491	return ptext, value
				1492
				1493	def _check_for_early_dl_end(value, domain_literal):
				1494	if value:
				1495	return False
				1496	domain_literal.append(errors.InvalidHeaderDefect(
				1497	"end of input inside domain-literal"))
				1498	domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
				1499	return True
				1500
				1501	def get_domain_literal(value):
				1502	""" domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS]
				1503
				1504	"""
				1505	domain_literal = DomainLiteral()
				1506	if value[0] in CFWS_LEADER:
				1507	token, value = get_cfws(value)
				1508	domain_literal.append(token)
				1509	if not value:
				1510	raise errors.HeaderParseError("expected domain-literal")
				1511	if value[0] != '[':
				1512	raise errors.HeaderParseError("expected '[' at start of domain-literal "
				1513	"but found '{}'".format(value))
				1514	value = value[1:]
				1515	if _check_for_early_dl_end(value, domain_literal):
				1516	return domain_literal, value
				1517	domain_literal.append(ValueTerminal('[', 'domain-literal-start'))
				1518	if value[0] in WSP:
				1519	token, value = get_fws(value)
				1520	domain_literal.append(token)
				1521	token, value = get_dtext(value)
				1522	domain_literal.append(token)
				1523	if _check_for_early_dl_end(value, domain_literal):
				1524	return domain_literal, value
				1525	if value[0] in WSP:
				1526	token, value = get_fws(value)
				1527	domain_literal.append(token)
				1528	if _check_for_early_dl_end(value, domain_literal):
				1529	return domain_literal, value
				1530	if value[0] != ']':
				1531	raise errors.HeaderParseError("expected ']' at end of domain-literal "
				1532	"but found '{}'".format(value))
				1533	domain_literal.append(ValueTerminal(']', 'domain-literal-end'))
				1534	value = value[1:]
				1535	if value and value[0] in CFWS_LEADER:
				1536	token, value = get_cfws(value)
				1537	domain_literal.append(token)
				1538	return domain_literal, value
				1539
				1540	def get_domain(value):
				1541	""" domain = dot-atom / domain-literal / obs-domain
				1542	obs-domain = atom *("." atom))
				1543
				1544	"""
				1545	domain = Domain()
				1546	leader = None
				1547	if value[0] in CFWS_LEADER:
				1548	leader, value = get_cfws(value)
				1549	if not value:
				1550	raise errors.HeaderParseError(
				1551	"expected domain but found '{}'".format(value))
				1552	if value[0] == '[':
				1553	token, value = get_domain_literal(value)
				1554	if leader is not None:
				1555	token[:0] = [leader]
				1556	domain.append(token)
				1557	return domain, value
				1558	try:
				1559	token, value = get_dot_atom(value)
				1560	except errors.HeaderParseError:
				1561	token, value = get_atom(value)
				1562	if leader is not None:
				1563	token[:0] = [leader]
				1564	domain.append(token)
				1565	if value and value[0] == '.':
				1566	domain.defects.append(errors.ObsoleteHeaderDefect(
				1567	"domain is not a dot-atom (contains CFWS)"))
				1568	if domain[0].token_type == 'dot-atom':
				1569	domain[:] = domain[0]
				1570	while value and value[0] == '.':
				1571	domain.append(DOT)
				1572	token, value = get_atom(value[1:])
				1573	domain.append(token)
				1574	return domain, value
				1575
				1576	def get_addr_spec(value):
				1577	""" addr-spec = local-part "@" domain
				1578
				1579	"""
				1580	addr_spec = AddrSpec()
				1581	token, value = get_local_part(value)
				1582	addr_spec.append(token)
				1583	if not value or value[0] != '@':
				1584	addr_spec.defects.append(errors.InvalidHeaderDefect(
				1585	"add-spec local part with no domain"))
				1586	return addr_spec, value
				1587	addr_spec.append(ValueTerminal('@', 'address-at-symbol'))
				1588	token, value = get_domain(value[1:])
				1589	addr_spec.append(token)
				1590	return addr_spec, value
				1591
				1592	def get_obs_route(value):
				1593	""" obs-route = obs-domain-list ":"
				1594	obs-domain-list = (CFWS / ",") "@" domain ("," [CFWS] ["@" domain])
				1595
				1596	Returns an obs-route token with the appropriate sub-tokens (that is,
				1597	there is no obs-domain-list in the parse tree).
				1598	"""
				1599	obs_route = ObsRoute()
				1600	while value and (value[0]==',' or value[0] in CFWS_LEADER):
				1601	if value[0] in CFWS_LEADER:
				1602	token, value = get_cfws(value)
				1603	obs_route.append(token)
				1604	elif value[0] == ',':
				1605	obs_route.append(ListSeparator)
				1606	value = value[1:]
				1607	if not value or value[0] != '@':
				1608	raise errors.HeaderParseError(
				1609	"expected obs-route domain but found '{}'".format(value))
				1610	obs_route.append(RouteComponentMarker)
				1611	token, value = get_domain(value[1:])
				1612	obs_route.append(token)
				1613	while value and value[0]==',':
				1614	obs_route.append(ListSeparator)
				1615	value = value[1:]
				1616	if not value:
				1617	break
				1618	if value[0] in CFWS_LEADER:
				1619	token, value = get_cfws(value)
				1620	obs_route.append(token)
				1621	if value[0] == '@':
				1622	obs_route.append(RouteComponentMarker)
				1623	token, value = get_domain(value[1:])
				1624	obs_route.append(token)
				1625	if not value:
				1626	raise errors.HeaderParseError("end of header while parsing obs-route")
				1627	if value[0] != ':':
				1628	raise errors.HeaderParseError( "expected ':' marking end of "
				1629	"obs-route but found '{}'".format(value))
				1630	obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker'))
				1631	return obs_route, value[1:]
				1632
				1633	def get_angle_addr(value):
				1634	""" angle-addr = [CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr
				1635	obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS]
				1636
				1637	"""
				1638	angle_addr = AngleAddr()
				1639	if value[0] in CFWS_LEADER:
				1640	token, value = get_cfws(value)
				1641	angle_addr.append(token)
				1642	if not value or value[0] != '<':
				1643	raise errors.HeaderParseError(
				1644	"expected angle-addr but found '{}'".format(value))
				1645	angle_addr.append(ValueTerminal('<', 'angle-addr-start'))
				1646	value = value[1:]
R David Murray	032eed3	2012-05-26 14:31:12 -0400	[diff] [blame]	1647	# Although it is not legal per RFC5322, SMTP uses '<>' in certain
				1648	# circumstances.
				1649	if value[0] == '>':
				1650	angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
				1651	angle_addr.defects.append(errors.InvalidHeaderDefect(
				1652	"null addr-spec in angle-addr"))
				1653	value = value[1:]
				1654	return angle_addr, value
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1655	try:
				1656	token, value = get_addr_spec(value)
				1657	except errors.HeaderParseError:
				1658	try:
				1659	token, value = get_obs_route(value)
				1660	angle_addr.defects.append(errors.ObsoleteHeaderDefect(
				1661	"obsolete route specification in angle-addr"))
				1662	except errors.HeaderParseError:
				1663	raise errors.HeaderParseError(
R David Murray	032eed3	2012-05-26 14:31:12 -0400	[diff] [blame]	1664	"expected addr-spec or obs-route but found '{}'".format(value))
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1665	angle_addr.append(token)
				1666	token, value = get_addr_spec(value)
				1667	angle_addr.append(token)
				1668	if value and value[0] == '>':
				1669	value = value[1:]
				1670	else:
				1671	angle_addr.defects.append(errors.InvalidHeaderDefect(
				1672	"missing trailing '>' on angle-addr"))
				1673	angle_addr.append(ValueTerminal('>', 'angle-addr-end'))
				1674	if value and value[0] in CFWS_LEADER:
				1675	token, value = get_cfws(value)
				1676	angle_addr.append(token)
				1677	return angle_addr, value
				1678
				1679	def get_display_name(value):
				1680	""" display-name = phrase
				1681
				1682	Because this is simply a name-rule, we don't return a display-name
				1683	token containing a phrase, but rather a display-name token with
				1684	the content of the phrase.
				1685
				1686	"""
				1687	display_name = DisplayName()
				1688	token, value = get_phrase(value)
				1689	display_name.extend(token[:])
				1690	display_name.defects = token.defects[:]
				1691	return display_name, value
				1692
				1693
				1694	def get_name_addr(value):
				1695	""" name-addr = [display-name] angle-addr
				1696
				1697	"""
				1698	name_addr = NameAddr()
				1699	# Both the optional display name and the angle-addr can start with cfws.
				1700	leader = None
				1701	if value[0] in CFWS_LEADER:
				1702	leader, value = get_cfws(value)
				1703	if not value:
				1704	raise errors.HeaderParseError(
				1705	"expected name-addr but found '{}'".format(leader))
				1706	if value[0] != '<':
				1707	if value[0] in PHRASE_ENDS:
				1708	raise errors.HeaderParseError(
				1709	"expected name-addr but found '{}'".format(value))
				1710	token, value = get_display_name(value)
				1711	if not value:
				1712	raise errors.HeaderParseError(
				1713	"expected name-addr but found '{}'".format(token))
				1714	if leader is not None:
				1715	token[0][:0] = [leader]
				1716	leader = None
				1717	name_addr.append(token)
				1718	token, value = get_angle_addr(value)
				1719	if leader is not None:
				1720	token[:0] = [leader]
				1721	name_addr.append(token)
				1722	return name_addr, value
				1723
				1724	def get_mailbox(value):
				1725	""" mailbox = name-addr / addr-spec
				1726
				1727	"""
				1728	# The only way to figure out if we are dealing with a name-addr or an
				1729	# addr-spec is to try parsing each one.
				1730	mailbox = Mailbox()
				1731	try:
				1732	token, value = get_name_addr(value)
				1733	except errors.HeaderParseError:
				1734	try:
				1735	token, value = get_addr_spec(value)
				1736	except errors.HeaderParseError:
				1737	raise errors.HeaderParseError(
				1738	"expected mailbox but found '{}'".format(value))
				1739	if any(isinstance(x, errors.InvalidHeaderDefect)
				1740	for x in token.all_defects):
				1741	mailbox.token_type = 'invalid-mailbox'
				1742	mailbox.append(token)
				1743	return mailbox, value
				1744
				1745	def get_invalid_mailbox(value, endchars):
				1746	""" Read everything up to one of the chars in endchars.
				1747
				1748	This is outside the formal grammar. The InvalidMailbox TokenList that is
				1749	returned acts like a Mailbox, but the data attributes are None.
				1750
				1751	"""
				1752	invalid_mailbox = InvalidMailbox()
				1753	while value and value[0] not in endchars:
				1754	if value[0] in PHRASE_ENDS:
				1755	invalid_mailbox.append(ValueTerminal(value[0],
				1756	'misplaced-special'))
				1757	value = value[1:]
				1758	else:
				1759	token, value = get_phrase(value)
				1760	invalid_mailbox.append(token)
				1761	return invalid_mailbox, value
				1762
				1763	def get_mailbox_list(value):
				1764	""" mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
				1765	obs-mbox-list = ([CFWS] ",") mailbox ("," [mailbox / CFWS])
				1766
				1767	For this routine we go outside the formal grammar in order to improve error
				1768	handling. We recognize the end of the mailbox list only at the end of the
				1769	value or at a ';' (the group terminator). This is so that we can turn
				1770	invalid mailboxes into InvalidMailbox tokens and continue parsing any
				1771	remaining valid mailboxes. We also allow all mailbox entries to be null,
				1772	and this condition is handled appropriately at a higher level.
				1773
				1774	"""
				1775	mailbox_list = MailboxList()
				1776	while value and value[0] != ';':
				1777	try:
				1778	token, value = get_mailbox(value)
				1779	mailbox_list.append(token)
				1780	except errors.HeaderParseError:
				1781	leader = None
				1782	if value[0] in CFWS_LEADER:
				1783	leader, value = get_cfws(value)
				1784	if not value or value[0] in ',;':
				1785	mailbox_list.append(leader)
				1786	mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
				1787	"empty element in mailbox-list"))
				1788	else:
				1789	token, value = get_invalid_mailbox(value, ',;')
				1790	if leader is not None:
				1791	token[:0] = [leader]
				1792	mailbox_list.append(token)
				1793	mailbox_list.defects.append(errors.InvalidHeaderDefect(
				1794	"invalid mailbox in mailbox-list"))
				1795	elif value[0] == ',':
				1796	mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
				1797	"empty element in mailbox-list"))
				1798	else:
				1799	token, value = get_invalid_mailbox(value, ',;')
				1800	if leader is not None:
				1801	token[:0] = [leader]
				1802	mailbox_list.append(token)
				1803	mailbox_list.defects.append(errors.InvalidHeaderDefect(
				1804	"invalid mailbox in mailbox-list"))
				1805	if value and value[0] not in ',;':
				1806	# Crap after mailbox; treat it as an invalid mailbox.
				1807	# The mailbox info will still be available.
				1808	mailbox = mailbox_list[-1]
				1809	mailbox.token_type = 'invalid-mailbox'
				1810	token, value = get_invalid_mailbox(value, ',;')
				1811	mailbox.extend(token)
				1812	mailbox_list.defects.append(errors.InvalidHeaderDefect(
				1813	"invalid mailbox in mailbox-list"))
				1814	if value and value[0] == ',':
				1815	mailbox_list.append(ListSeparator)
				1816	value = value[1:]
				1817	return mailbox_list, value
				1818
				1819
				1820	def get_group_list(value):
				1821	""" group-list = mailbox-list / CFWS / obs-group-list
				1822	obs-group-list = 1*([CFWS] ",") [CFWS]
				1823
				1824	"""
				1825	group_list = GroupList()
				1826	if not value:
				1827	group_list.defects.append(errors.InvalidHeaderDefect(
				1828	"end of header before group-list"))
				1829	return group_list, value
				1830	leader = None
				1831	if value and value[0] in CFWS_LEADER:
				1832	leader, value = get_cfws(value)
				1833	if not value:
				1834	# This should never happen in email parsing, since CFWS-only is a
				1835	# legal alternative to group-list in a group, which is the only
				1836	# place group-list appears.
				1837	group_list.defects.append(errors.InvalidHeaderDefect(
				1838	"end of header in group-list"))
				1839	group_list.append(leader)
				1840	return group_list, value
				1841	if value[0] == ';':
				1842	group_list.append(leader)
				1843	return group_list, value
				1844	token, value = get_mailbox_list(value)
				1845	if len(token.all_mailboxes)==0:
				1846	if leader is not None:
				1847	group_list.append(leader)
				1848	group_list.extend(token)
				1849	group_list.defects.append(errors.ObsoleteHeaderDefect(
				1850	"group-list with empty entries"))
				1851	return group_list, value
				1852	if leader is not None:
				1853	token[:0] = [leader]
				1854	group_list.append(token)
				1855	return group_list, value
				1856
				1857	def get_group(value):
				1858	""" group = display-name ":" [group-list] ";" [CFWS]
				1859
				1860	"""
				1861	group = Group()
				1862	token, value = get_display_name(value)
				1863	if not value or value[0] != ':':
				1864	raise errors.HeaderParseError("expected ':' at end of group "
				1865	"display name but found '{}'".format(value))
				1866	group.append(token)
				1867	group.append(ValueTerminal(':', 'group-display-name-terminator'))
				1868	value = value[1:]
				1869	if value and value[0] == ';':
				1870	group.append(ValueTerminal(';', 'group-terminator'))
				1871	return group, value[1:]
				1872	token, value = get_group_list(value)
				1873	group.append(token)
				1874	if not value:
				1875	group.defects.append(errors.InvalidHeaderDefect(
				1876	"end of header in group"))
Dong-hee Na	8fe9eed	2018-07-28 21:55:11 +0900	[diff] [blame]	1877	elif value[0] != ';':
R David Murray	0b6f6c8	2012-05-25 18:42:14 -0400	[diff] [blame]	1878	raise errors.HeaderParseError(
				1879	"expected ';' at end of group but found {}".format(value))
				1880	group.append(ValueTerminal(';', 'group-terminator'))
				1881	value = value[1:]
				1882	if value and value[0] in CFWS_LEADER:
				1883	token, value = get_cfws(value)
				1884	group.append(token)
				1885	return group, value
				1886
				1887	def get_address(value):
				1888	""" address = mailbox / group
				1889
				1890	Note that counter-intuitively, an address can be either a single address or
				1891	a list of addresses (a group). This is why the returned Address object has
				1892	a 'mailboxes' attribute which treats a single address as a list of length
				1893	one. When you need to differentiate between to two cases, extract the single
				1894	element, which is either a mailbox or a group token.
				1895
				1896	"""
				1897	# The formal grammar isn't very helpful when parsing an address. mailbox
				1898	# and group, especially when allowing for obsolete forms, start off very
				1899	# similarly. It is only when you reach one of @, <, or : that you know
				1900	# what you've got. So, we try each one in turn, starting with the more
				1901	# likely of the two. We could perhaps make this more efficient by looking
				1902	# for a phrase and then branching based on the next character, but that
				1903	# would be a premature optimization.
				1904	address = Address()
				1905	try:
				1906	token, value = get_group(value)
				1907	except errors.HeaderParseError:
				1908	try:
				1909	token, value = get_mailbox(value)
				1910	except errors.HeaderParseError:
				1911	raise errors.HeaderParseError(
				1912	"expected address but found '{}'".format(value))
				1913	address.append(token)
				1914	return address, value
				1915
				1916	def get_address_list(value):
				1917	""" address_list = (address *("," address)) / obs-addr-list
				1918	obs-addr-list = ([CFWS] ",") address ("," [address / CFWS])
				1919
				1920	We depart from the formal grammar here by continuing to parse until the end
				1921	of the input, assuming the input to be entirely composed of an
				1922	address-list. This is always true in email parsing, and allows us
				1923	to skip invalid addresses to parse additional valid ones.
				1924
				1925	"""
				1926	address_list = AddressList()
				1927	while value:
				1928	try:
				1929	token, value = get_address(value)
				1930	address_list.append(token)
				1931	except errors.HeaderParseError as err:
				1932	leader = None
				1933	if value[0] in CFWS_LEADER:
				1934	leader, value = get_cfws(value)
				1935	if not value or value[0] == ',':
				1936	address_list.append(leader)
				1937	address_list.defects.append(errors.ObsoleteHeaderDefect(
				1938	"address-list entry with no content"))
				1939	else:
				1940	token, value = get_invalid_mailbox(value, ',')
				1941	if leader is not None:
				1942	token[:0] = [leader]
				1943	address_list.append(Address([token]))
				1944	address_list.defects.append(errors.InvalidHeaderDefect(
				1945	"invalid address in address-list"))
				1946	elif value[0] == ',':
				1947	address_list.defects.append(errors.ObsoleteHeaderDefect(
				1948	"empty element in address-list"))
				1949	else:
				1950	token, value = get_invalid_mailbox(value, ',')
				1951	if leader is not None:
				1952	token[:0] = [leader]
				1953	address_list.append(Address([token]))
				1954	address_list.defects.append(errors.InvalidHeaderDefect(
				1955	"invalid address in address-list"))
				1956	if value and value[0] != ',':
				1957	# Crap after address; treat it as an invalid mailbox.
				1958	# The mailbox info will still be available.
				1959	mailbox = address_list[-1][0]
				1960	mailbox.token_type = 'invalid-mailbox'
				1961	token, value = get_invalid_mailbox(value, ',')
				1962	mailbox.extend(token)
				1963	address_list.defects.append(errors.InvalidHeaderDefect(
				1964	"invalid address in address-list"))
				1965	if value: # Must be a , at this point.
				1966	address_list.append(ValueTerminal(',', 'list-separator'))
				1967	value = value[1:]
				1968	return address_list, value
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	1969
				1970	#
				1971	# XXX: As I begin to add additional header parsers, I'm realizing we probably
				1972	# have two level of parser routines: the get_XXX methods that get a token in
				1973	# the grammar, and parse_XXX methods that parse an entire field value. So
				1974	# get_address_list above should really be a parse_ method, as probably should
				1975	# be get_unstructured.
				1976	#
				1977
				1978	def parse_mime_version(value):
				1979	""" mime-version = [CFWS] 1digit [CFWS] "." [CFWS] 1digit [CFWS]
				1980
				1981	"""
				1982	# The [CFWS] is implicit in the RFC 2045 BNF.
				1983	# XXX: This routine is a bit verbose, should factor out a get_int method.
				1984	mime_version = MIMEVersion()
				1985	if not value:
				1986	mime_version.defects.append(errors.HeaderMissingRequiredValue(
				1987	"Missing MIME version number (eg: 1.0)"))
				1988	return mime_version
				1989	if value[0] in CFWS_LEADER:
				1990	token, value = get_cfws(value)
				1991	mime_version.append(token)
				1992	if not value:
				1993	mime_version.defects.append(errors.HeaderMissingRequiredValue(
				1994	"Expected MIME version number but found only CFWS"))
				1995	digits = ''
				1996	while value and value[0] != '.' and value[0] not in CFWS_LEADER:
				1997	digits += value[0]
				1998	value = value[1:]
				1999	if not digits.isdigit():
				2000	mime_version.defects.append(errors.InvalidHeaderDefect(
				2001	"Expected MIME major version number but found {!r}".format(digits)))
				2002	mime_version.append(ValueTerminal(digits, 'xtext'))
				2003	else:
				2004	mime_version.major = int(digits)
				2005	mime_version.append(ValueTerminal(digits, 'digits'))
				2006	if value and value[0] in CFWS_LEADER:
				2007	token, value = get_cfws(value)
				2008	mime_version.append(token)
				2009	if not value or value[0] != '.':
				2010	if mime_version.major is not None:
				2011	mime_version.defects.append(errors.InvalidHeaderDefect(
				2012	"Incomplete MIME version; found only major number"))
				2013	if value:
				2014	mime_version.append(ValueTerminal(value, 'xtext'))
				2015	return mime_version
				2016	mime_version.append(ValueTerminal('.', 'version-separator'))
				2017	value = value[1:]
				2018	if value and value[0] in CFWS_LEADER:
				2019	token, value = get_cfws(value)
				2020	mime_version.append(token)
				2021	if not value:
				2022	if mime_version.major is not None:
				2023	mime_version.defects.append(errors.InvalidHeaderDefect(
				2024	"Incomplete MIME version; found only major number"))
				2025	return mime_version
				2026	digits = ''
				2027	while value and value[0] not in CFWS_LEADER:
				2028	digits += value[0]
				2029	value = value[1:]
				2030	if not digits.isdigit():
				2031	mime_version.defects.append(errors.InvalidHeaderDefect(
				2032	"Expected MIME minor version number but found {!r}".format(digits)))
				2033	mime_version.append(ValueTerminal(digits, 'xtext'))
				2034	else:
				2035	mime_version.minor = int(digits)
				2036	mime_version.append(ValueTerminal(digits, 'digits'))
				2037	if value and value[0] in CFWS_LEADER:
				2038	token, value = get_cfws(value)
				2039	mime_version.append(token)
				2040	if value:
				2041	mime_version.defects.append(errors.InvalidHeaderDefect(
				2042	"Excess non-CFWS text after MIME version"))
				2043	mime_version.append(ValueTerminal(value, 'xtext'))
				2044	return mime_version
				2045
				2046	def get_invalid_parameter(value):
				2047	""" Read everything up to the next ';'.
				2048
				2049	This is outside the formal grammar. The InvalidParameter TokenList that is
				2050	returned acts like a Parameter, but the data attributes are None.
				2051
				2052	"""
				2053	invalid_parameter = InvalidParameter()
				2054	while value and value[0] != ';':
				2055	if value[0] in PHRASE_ENDS:
				2056	invalid_parameter.append(ValueTerminal(value[0],
				2057	'misplaced-special'))
				2058	value = value[1:]
				2059	else:
				2060	token, value = get_phrase(value)
				2061	invalid_parameter.append(token)
				2062	return invalid_parameter, value
				2063
				2064	def get_ttext(value):
				2065	"""ttext = <matches _ttext_matcher>
				2066
				2067	We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
				2068	defects list if we find non-ttext characters. We also register defects for
				2069	any non-printables even though the RFC doesn't exclude all of them,
				2070	because we follow the spirit of RFC 5322.
				2071
				2072	"""
				2073	m = _non_token_end_matcher(value)
				2074	if not m:
				2075	raise errors.HeaderParseError(
				2076	"expected ttext but found '{}'".format(value))
				2077	ttext = m.group()
				2078	value = value[len(ttext):]
				2079	ttext = ValueTerminal(ttext, 'ttext')
				2080	_validate_xtext(ttext)
				2081	return ttext, value
				2082
				2083	def get_token(value):
				2084	"""token = [CFWS] 1*ttext [CFWS]
				2085
				2086	The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
				2087	tspecials. We also exclude tabs even though the RFC doesn't.
				2088
				2089	The RFC implies the CFWS but is not explicit about it in the BNF.
				2090
				2091	"""
				2092	mtoken = Token()
				2093	if value and value[0] in CFWS_LEADER:
				2094	token, value = get_cfws(value)
				2095	mtoken.append(token)
				2096	if value and value[0] in TOKEN_ENDS:
				2097	raise errors.HeaderParseError(
				2098	"expected token but found '{}'".format(value))
				2099	token, value = get_ttext(value)
				2100	mtoken.append(token)
				2101	if value and value[0] in CFWS_LEADER:
				2102	token, value = get_cfws(value)
				2103	mtoken.append(token)
				2104	return mtoken, value
				2105
				2106	def get_attrtext(value):
				2107	"""attrtext = 1*(any non-ATTRIBUTE_ENDS character)
				2108
				2109	We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
				2110	token's defects list if we find non-attrtext characters. We also register
				2111	defects for any non-printables even though the RFC doesn't exclude all of
				2112	them, because we follow the spirit of RFC 5322.
				2113
				2114	"""
				2115	m = _non_attribute_end_matcher(value)
				2116	if not m:
				2117	raise errors.HeaderParseError(
				2118	"expected attrtext but found {!r}".format(value))
				2119	attrtext = m.group()
				2120	value = value[len(attrtext):]
				2121	attrtext = ValueTerminal(attrtext, 'attrtext')
				2122	_validate_xtext(attrtext)
				2123	return attrtext, value
				2124
				2125	def get_attribute(value):
				2126	""" [CFWS] 1*attrtext [CFWS]
				2127
				2128	This version of the BNF makes the CFWS explicit, and as usual we use a
				2129	value terminal for the actual run of characters. The RFC equivalent of
				2130	attrtext is the token characters, with the subtraction of '*', "'", and '%'.
				2131	We include tab in the excluded set just as we do for token.
				2132
				2133	"""
				2134	attribute = Attribute()
				2135	if value and value[0] in CFWS_LEADER:
				2136	token, value = get_cfws(value)
				2137	attribute.append(token)
				2138	if value and value[0] in ATTRIBUTE_ENDS:
				2139	raise errors.HeaderParseError(
				2140	"expected token but found '{}'".format(value))
				2141	token, value = get_attrtext(value)
				2142	attribute.append(token)
				2143	if value and value[0] in CFWS_LEADER:
				2144	token, value = get_cfws(value)
				2145	attribute.append(token)
				2146	return attribute, value
				2147
				2148	def get_extended_attrtext(value):
				2149	"""attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')
				2150
				2151	This is a special parsing routine so that we get a value that
				2152	includes % escapes as a single string (which we decode as a single
				2153	string later).
				2154
				2155	"""
				2156	m = _non_extended_attribute_end_matcher(value)
				2157	if not m:
				2158	raise errors.HeaderParseError(
				2159	"expected extended attrtext but found {!r}".format(value))
				2160	attrtext = m.group()
				2161	value = value[len(attrtext):]
				2162	attrtext = ValueTerminal(attrtext, 'extended-attrtext')
				2163	_validate_xtext(attrtext)
				2164	return attrtext, value
				2165
				2166	def get_extended_attribute(value):
				2167	""" [CFWS] 1*extended_attrtext [CFWS]
				2168
				2169	This is like the non-extended version except we allow % characters, so that
				2170	we can pick up an encoded value as a single string.
				2171
				2172	"""
				2173	# XXX: should we have an ExtendedAttribute TokenList?
				2174	attribute = Attribute()
				2175	if value and value[0] in CFWS_LEADER:
				2176	token, value = get_cfws(value)
				2177	attribute.append(token)
				2178	if value and value[0] in EXTENDED_ATTRIBUTE_ENDS:
				2179	raise errors.HeaderParseError(
				2180	"expected token but found '{}'".format(value))
				2181	token, value = get_extended_attrtext(value)
				2182	attribute.append(token)
				2183	if value and value[0] in CFWS_LEADER:
				2184	token, value = get_cfws(value)
				2185	attribute.append(token)
				2186	return attribute, value
				2187
				2188	def get_section(value):
				2189	""" '*' digits
				2190
				2191	The formal BNF is more complicated because leading 0s are not allowed. We
				2192	check for that and add a defect. We also assume no CFWS is allowed between
				2193	the '*' and the digits, though the RFC is not crystal clear on that.
				2194	The caller should already have dealt with leading CFWS.
				2195
				2196	"""
				2197	section = Section()
				2198	if not value or value[0] != '*':
				2199	raise errors.HeaderParseError("Expected section but found {}".format(
				2200	value))
				2201	section.append(ValueTerminal('*', 'section-marker'))
				2202	value = value[1:]
				2203	if not value or not value[0].isdigit():
				2204	raise errors.HeaderParseError("Expected section number but "
				2205	"found {}".format(value))
				2206	digits = ''
				2207	while value and value[0].isdigit():
				2208	digits += value[0]
				2209	value = value[1:]
				2210	if digits[0] == '0' and digits != '0':
Serhiy Storchaka	34fd4c2	2018-11-05 16:20:25 +0200	[diff] [blame]	2211	section.defects.append(errors.InvalidHeaderError(
				2212	"section number has an invalid leading 0"))
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2213	section.number = int(digits)
				2214	section.append(ValueTerminal(digits, 'digits'))
				2215	return section, value
				2216
				2217
				2218	def get_value(value):
				2219	""" quoted-string / attribute
				2220
				2221	"""
				2222	v = Value()
				2223	if not value:
				2224	raise errors.HeaderParseError("Expected value but found end of string")
				2225	leader = None
				2226	if value[0] in CFWS_LEADER:
				2227	leader, value = get_cfws(value)
				2228	if not value:
				2229	raise errors.HeaderParseError("Expected value but found "
				2230	"only {}".format(leader))
				2231	if value[0] == '"':
				2232	token, value = get_quoted_string(value)
				2233	else:
				2234	token, value = get_extended_attribute(value)
				2235	if leader is not None:
				2236	token[:0] = [leader]
				2237	v.append(token)
				2238	return v, value
				2239
				2240	def get_parameter(value):
				2241	""" attribute [section] ["*"] [CFWS] "=" value
				2242
				2243	The CFWS is implied by the RFC but not made explicit in the BNF. This
				2244	simplified form of the BNF from the RFC is made to conform with the RFC BNF
				2245	through some extra checks. We do it this way because it makes both error
				2246	recovery and working with the resulting parse tree easier.
				2247	"""
				2248	# It is possible CFWS would also be implicitly allowed between the section
				2249	# and the 'extended-attribute' marker (the '*') , but we've never seen that
				2250	# in the wild and we will therefore ignore the possibility.
				2251	param = Parameter()
				2252	token, value = get_attribute(value)
				2253	param.append(token)
				2254	if not value or value[0] == ';':
				2255	param.defects.append(errors.InvalidHeaderDefect("Parameter contains "
				2256	"name ({}) but no value".format(token)))
				2257	return param, value
				2258	if value[0] == '*':
				2259	try:
				2260	token, value = get_section(value)
				2261	param.sectioned = True
				2262	param.append(token)
				2263	except errors.HeaderParseError:
				2264	pass
				2265	if not value:
				2266	raise errors.HeaderParseError("Incomplete parameter")
				2267	if value[0] == '*':
				2268	param.append(ValueTerminal('*', 'extended-parameter-marker'))
				2269	value = value[1:]
				2270	param.extended = True
				2271	if value[0] != '=':
				2272	raise errors.HeaderParseError("Parameter not followed by '='")
				2273	param.append(ValueTerminal('=', 'parameter-separator'))
				2274	value = value[1:]
				2275	leader = None
				2276	if value and value[0] in CFWS_LEADER:
				2277	token, value = get_cfws(value)
				2278	param.append(token)
				2279	remainder = None
				2280	appendto = param
				2281	if param.extended and value and value[0] == '"':
				2282	# Now for some serious hackery to handle the common invalid case of
				2283	# double quotes around an extended value. We also accept (with defect)
				2284	# a value marked as encoded that isn't really.
				2285	qstring, remainder = get_quoted_string(value)
				2286	inner_value = qstring.stripped_value
				2287	semi_valid = False
				2288	if param.section_number == 0:
				2289	if inner_value and inner_value[0] == "'":
				2290	semi_valid = True
				2291	else:
				2292	token, rest = get_attrtext(inner_value)
				2293	if rest and rest[0] == "'":
				2294	semi_valid = True
				2295	else:
				2296	try:
				2297	token, rest = get_extended_attrtext(inner_value)
				2298	except:
				2299	pass
				2300	else:
				2301	if not rest:
				2302	semi_valid = True
				2303	if semi_valid:
				2304	param.defects.append(errors.InvalidHeaderDefect(
				2305	"Quoted string value for extended parameter is invalid"))
				2306	param.append(qstring)
				2307	for t in qstring:
				2308	if t.token_type == 'bare-quoted-string':
				2309	t[:] = []
				2310	appendto = t
				2311	break
				2312	value = inner_value
				2313	else:
				2314	remainder = None
				2315	param.defects.append(errors.InvalidHeaderDefect(
				2316	"Parameter marked as extended but appears to have a "
				2317	"quoted string value that is non-encoded"))
				2318	if value and value[0] == "'":
				2319	token = None
				2320	else:
				2321	token, value = get_value(value)
				2322	if not param.extended or param.section_number > 0:
				2323	if not value or value[0] != "'":
				2324	appendto.append(token)
				2325	if remainder is not None:
				2326	assert not value, value
				2327	value = remainder
				2328	return param, value
				2329	param.defects.append(errors.InvalidHeaderDefect(
				2330	"Apparent initial-extended-value but attribute "
				2331	"was not marked as extended or was not initial section"))
				2332	if not value:
				2333	# Assume the charset/lang is missing and the token is the value.
				2334	param.defects.append(errors.InvalidHeaderDefect(
				2335	"Missing required charset/lang delimiters"))
				2336	appendto.append(token)
				2337	if remainder is None:
				2338	return param, value
				2339	else:
				2340	if token is not None:
				2341	for t in token:
				2342	if t.token_type == 'extended-attrtext':
				2343	break
				2344	t.token_type == 'attrtext'
				2345	appendto.append(t)
				2346	param.charset = t.value
				2347	if value[0] != "'":
				2348	raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
				2349	"delimiter, but found {!r}".format(value))
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	2350	appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2351	value = value[1:]
				2352	if value and value[0] != "'":
				2353	token, value = get_attrtext(value)
				2354	appendto.append(token)
				2355	param.lang = token.value
				2356	if not value or value[0] != "'":
				2357	raise errors.HeaderParseError("Expected RFC2231 char/lang encoding "
				2358	"delimiter, but found {}".format(value))
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	2359	appendto.append(ValueTerminal("'", 'RFC2231-delimiter'))
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2360	value = value[1:]
				2361	if remainder is not None:
				2362	# Treat the rest of value as bare quoted string content.
				2363	v = Value()
				2364	while value:
				2365	if value[0] in WSP:
				2366	token, value = get_fws(value)
				2367	else:
				2368	token, value = get_qcontent(value)
				2369	v.append(token)
				2370	token = v
				2371	else:
				2372	token, value = get_value(value)
				2373	appendto.append(token)
				2374	if remainder is not None:
				2375	assert not value, value
				2376	value = remainder
				2377	return param, value
				2378
				2379	def parse_mime_parameters(value):
				2380	""" parameter *( ";" parameter )
				2381
				2382	That BNF is meant to indicate this routine should only be called after
				2383	finding and handling the leading ';'. There is no corresponding rule in
				2384	the formal RFC grammar, but it is more convenient for us for the set of
				2385	parameters to be treated as its own TokenList.
				2386
				2387	This is 'parse' routine because it consumes the reminaing value, but it
				2388	would never be called to parse a full header. Instead it is called to
				2389	parse everything after the non-parameter value of a specific MIME header.
				2390
				2391	"""
				2392	mime_parameters = MimeParameters()
				2393	while value:
				2394	try:
				2395	token, value = get_parameter(value)
				2396	mime_parameters.append(token)
				2397	except errors.HeaderParseError as err:
				2398	leader = None
				2399	if value[0] in CFWS_LEADER:
				2400	leader, value = get_cfws(value)
				2401	if not value:
				2402	mime_parameters.append(leader)
				2403	return mime_parameters
				2404	if value[0] == ';':
				2405	if leader is not None:
				2406	mime_parameters.append(leader)
				2407	mime_parameters.defects.append(errors.InvalidHeaderDefect(
				2408	"parameter entry with no content"))
				2409	else:
				2410	token, value = get_invalid_parameter(value)
				2411	if leader:
				2412	token[:0] = [leader]
				2413	mime_parameters.append(token)
				2414	mime_parameters.defects.append(errors.InvalidHeaderDefect(
				2415	"invalid parameter {!r}".format(token)))
				2416	if value and value[0] != ';':
				2417	# Junk after the otherwise valid parameter. Mark it as
				2418	# invalid, but it will have a value.
				2419	param = mime_parameters[-1]
				2420	param.token_type = 'invalid-parameter'
				2421	token, value = get_invalid_parameter(value)
				2422	param.extend(token)
				2423	mime_parameters.defects.append(errors.InvalidHeaderDefect(
				2424	"parameter with invalid trailing text {!r}".format(token)))
				2425	if value:
				2426	# Must be a ';' at this point.
				2427	mime_parameters.append(ValueTerminal(';', 'parameter-separator'))
				2428	value = value[1:]
				2429	return mime_parameters
				2430
				2431	def _find_mime_parameters(tokenlist, value):
				2432	"""Do our best to find the parameters in an invalid MIME header
				2433
				2434	"""
				2435	while value and value[0] != ';':
				2436	if value[0] in PHRASE_ENDS:
				2437	tokenlist.append(ValueTerminal(value[0], 'misplaced-special'))
				2438	value = value[1:]
				2439	else:
				2440	token, value = get_phrase(value)
				2441	tokenlist.append(token)
				2442	if not value:
				2443	return
				2444	tokenlist.append(ValueTerminal(';', 'parameter-separator'))
				2445	tokenlist.append(parse_mime_parameters(value[1:]))
				2446
				2447	def parse_content_type_header(value):
				2448	""" maintype "/" subtype *( ";" parameter )
				2449
				2450	The maintype and substype are tokens. Theoretically they could
				2451	be checked against the official IANA list + x-token, but we
				2452	don't do that.
				2453	"""
				2454	ctype = ContentType()
				2455	recover = False
				2456	if not value:
				2457	ctype.defects.append(errors.HeaderMissingRequiredValue(
				2458	"Missing content type specification"))
				2459	return ctype
				2460	try:
				2461	token, value = get_token(value)
				2462	except errors.HeaderParseError:
				2463	ctype.defects.append(errors.InvalidHeaderDefect(
				2464	"Expected content maintype but found {!r}".format(value)))
				2465	_find_mime_parameters(ctype, value)
				2466	return ctype
				2467	ctype.append(token)
Martin Panter	46f5072	2016-05-26 05:35:26 +0000	[diff] [blame]	2468	# XXX: If we really want to follow the formal grammar we should make
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2469	# mantype and subtype specialized TokenLists here. Probably not worth it.
				2470	if not value or value[0] != '/':
				2471	ctype.defects.append(errors.InvalidHeaderDefect(
				2472	"Invalid content type"))
				2473	if value:
				2474	_find_mime_parameters(ctype, value)
				2475	return ctype
				2476	ctype.maintype = token.value.strip().lower()
				2477	ctype.append(ValueTerminal('/', 'content-type-separator'))
				2478	value = value[1:]
				2479	try:
				2480	token, value = get_token(value)
				2481	except errors.HeaderParseError:
				2482	ctype.defects.append(errors.InvalidHeaderDefect(
				2483	"Expected content subtype but found {!r}".format(value)))
				2484	_find_mime_parameters(ctype, value)
				2485	return ctype
				2486	ctype.append(token)
				2487	ctype.subtype = token.value.strip().lower()
				2488	if not value:
				2489	return ctype
				2490	if value[0] != ';':
				2491	ctype.defects.append(errors.InvalidHeaderDefect(
				2492	"Only parameters are valid after content type, but "
				2493	"found {!r}".format(value)))
				2494	# The RFC requires that a syntactically invalid content-type be treated
				2495	# as text/plain. Perhaps we should postel this, but we should probably
				2496	# only do that if we were checking the subtype value against IANA.
				2497	del ctype.maintype, ctype.subtype
				2498	_find_mime_parameters(ctype, value)
				2499	return ctype
				2500	ctype.append(ValueTerminal(';', 'parameter-separator'))
				2501	ctype.append(parse_mime_parameters(value[1:]))
				2502	return ctype
				2503
				2504	def parse_content_disposition_header(value):
				2505	""" disposition-type *( ";" parameter )
				2506
				2507	"""
				2508	disp_header = ContentDisposition()
				2509	if not value:
				2510	disp_header.defects.append(errors.HeaderMissingRequiredValue(
				2511	"Missing content disposition"))
				2512	return disp_header
				2513	try:
				2514	token, value = get_token(value)
				2515	except errors.HeaderParseError:
Ezio Melotti	d577480	2014-08-04 17:16:49 +0300	[diff] [blame]	2516	disp_header.defects.append(errors.InvalidHeaderDefect(
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2517	"Expected content disposition but found {!r}".format(value)))
				2518	_find_mime_parameters(disp_header, value)
				2519	return disp_header
				2520	disp_header.append(token)
				2521	disp_header.content_disposition = token.value.strip().lower()
				2522	if not value:
				2523	return disp_header
				2524	if value[0] != ';':
				2525	disp_header.defects.append(errors.InvalidHeaderDefect(
				2526	"Only parameters are valid after content disposition, but "
				2527	"found {!r}".format(value)))
				2528	_find_mime_parameters(disp_header, value)
				2529	return disp_header
				2530	disp_header.append(ValueTerminal(';', 'parameter-separator'))
				2531	disp_header.append(parse_mime_parameters(value[1:]))
				2532	return disp_header
				2533
				2534	def parse_content_transfer_encoding_header(value):
				2535	""" mechanism
				2536
				2537	"""
				2538	# We should probably validate the values, since the list is fixed.
				2539	cte_header = ContentTransferEncoding()
				2540	if not value:
				2541	cte_header.defects.append(errors.HeaderMissingRequiredValue(
				2542	"Missing content transfer encoding"))
				2543	return cte_header
				2544	try:
				2545	token, value = get_token(value)
				2546	except errors.HeaderParseError:
Ezio Melotti	d577480	2014-08-04 17:16:49 +0300	[diff] [blame]	2547	cte_header.defects.append(errors.InvalidHeaderDefect(
				2548	"Expected content transfer encoding but found {!r}".format(value)))
R David Murray	97f43c0	2012-06-24 05:03:27 -0400	[diff] [blame]	2549	else:
				2550	cte_header.append(token)
				2551	cte_header.cte = token.value.strip().lower()
				2552	if not value:
				2553	return cte_header
				2554	while value:
				2555	cte_header.defects.append(errors.InvalidHeaderDefect(
				2556	"Extra text after content transfer encoding"))
				2557	if value[0] in PHRASE_ENDS:
				2558	cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
				2559	value = value[1:]
				2560	else:
				2561	token, value = get_phrase(value)
				2562	cte_header.append(token)
				2563	return cte_header
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	2564
				2565
#
# Header folding
#
# Header folding is complex, with lots of rules and corner cases.  The
# following code does its best to obey the rules and handle the corner
# cases, but you can be sure there are a few bugs:)
#
# This folder generally canonicalizes as it goes, preferring the stringified
# version of each token.  The tokens contain information that supports the
# folder, including which tokens can be encoded in which ways.
#
# Folded text is accumulated in a simple list of strings ('lines'), each
# one of which should be no longer than policy.max_line_length ('maxlen').
#
				2580
				2581	def _steal_trailing_WSP_if_exists(lines):
				2582	wsp = ''
				2583	if lines and lines[-1] and lines[-1][-1] in WSP:
				2584	wsp = lines[-1][-1]
				2585	lines[-1] = lines[-1][:-1]
				2586	return wsp
				2587
				2588	def _refold_parse_tree(parse_tree, *, policy):
				2589	"""Return string of contents of parse_tree folded according to RFC rules.
				2590
				2591	"""
				2592	# max_line_length 0/None means no limit, ie: infinitely long.
				2593	maxlen = policy.max_line_length or float("+inf")
				2594	encoding = 'utf-8' if policy.utf8 else 'us-ascii'
				2595	lines = ['']
				2596	last_ew = None
				2597	wrap_as_ew_blocked = 0
				2598	want_encoding = False
				2599	end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
				2600	parts = list(parse_tree)
				2601	while parts:
				2602	part = parts.pop(0)
				2603	if part is end_ew_not_allowed:
				2604	wrap_as_ew_blocked -= 1
				2605	continue
				2606	tstr = str(part)
				2607	try:
				2608	tstr.encode(encoding)
				2609	charset = encoding
				2610	except UnicodeEncodeError:
				2611	if any(isinstance(x, errors.UndecodableBytesDefect)
				2612	for x in part.all_defects):
				2613	charset = 'unknown-8bit'
				2614	else:
				2615	# If policy.utf8 is false this should really be taken from a
				2616	# 'charset' property on the policy.
				2617	charset = 'utf-8'
				2618	want_encoding = True
				2619	if part.token_type == 'mime-parameters':
				2620	# Mime parameter folding (using RFC2231) is extra special.
				2621	_fold_mime_parameters(part, lines, maxlen, encoding)
				2622	continue
				2623	if want_encoding and not wrap_as_ew_blocked:
				2624	if not part.as_ew_allowed:
				2625	want_encoding = False
				2626	last_ew = None
				2627	if part.syntactic_break:
Jens Troeger	45b2f88	2019-05-14 11:07:39 +1000	[diff] [blame^]	2628	encoded_part = part.fold(policy=policy)[:-len(policy.linesep)]
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	2629	if policy.linesep not in encoded_part:
				2630	# It fits on a single line
				2631	if len(encoded_part) > maxlen - len(lines[-1]):
				2632	# But not on this one, so start a new one.
				2633	newline = _steal_trailing_WSP_if_exists(lines)
				2634	# XXX what if encoded_part has no leading FWS?
				2635	lines.append(newline)
				2636	lines[-1] += encoded_part
				2637	continue
				2638	# Either this is not a major syntactic break, so we don't
				2639	# want it on a line by itself even if it fits, or it
				2640	# doesn't fit on a line by itself. Either way, fall through
				2641	# to unpacking the subparts and wrapping them.
				2642	if not hasattr(part, 'encode'):
				2643	# It's not a Terminal, do each piece individually.
				2644	parts = list(part) + parts
				2645	else:
				2646	# It's a terminal, wrap it as an encoded word, possibly
				2647	# combining it with previously encoded words if allowed.
				2648	last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
				2649	part.ew_combine_allowed, charset)
				2650	want_encoding = False
				2651	continue
				2652	if len(tstr) <= maxlen - len(lines[-1]):
				2653	lines[-1] += tstr
				2654	continue
				2655	# This part is too long to fit. The RFC wants us to break at
				2656	# "major syntactic breaks", so unless we don't consider this
				2657	# to be one, check if it will fit on the next line by itself.
				2658	if (part.syntactic_break and
				2659	len(tstr) + 1 <= maxlen):
				2660	newline = _steal_trailing_WSP_if_exists(lines)
				2661	if newline or part.startswith_fws():
				2662	lines.append(newline + tstr)
				2663	continue
				2664	if not hasattr(part, 'encode'):
				2665	# It's not a terminal, try folding the subparts.
				2666	newparts = list(part)
				2667	if not part.as_ew_allowed:
				2668	wrap_as_ew_blocked += 1
				2669	newparts.append(end_ew_not_allowed)
				2670	parts = newparts + parts
				2671	continue
				2672	if part.as_ew_allowed and not wrap_as_ew_blocked:
				2673	# It doesn't need CTE encoding, but encode it anyway so we can
				2674	# wrap it.
				2675	parts.insert(0, part)
				2676	want_encoding = True
				2677	continue
				2678	# We can't figure out how to wrap, it, so give up.
				2679	newline = _steal_trailing_WSP_if_exists(lines)
				2680	if newline or part.startswith_fws():
				2681	lines.append(newline + tstr)
				2682	else:
				2683	# We can't fold it onto the next line either...
				2684	lines[-1] += tstr
				2685	return policy.linesep.join(lines) + policy.linesep
				2686
				2687	def _fold_as_ew(to_encode, lines, maxlen, last_ew, ew_combine_allowed, charset):
				2688	"""Fold string to_encode into lines as encoded word, combining if allowed.
				2689	Return the new value for last_ew, or None if ew_combine_allowed is False.
				2690
				2691	If there is already an encoded word in the last line of lines (indicated by
				2692	a non-None value for last_ew) and ew_combine_allowed is true, decode the
				2693	existing ew, combine it with to_encode, and re-encode. Otherwise, encode
				2694	to_encode. In either case, split to_encode as necessary so that the
				2695	encoded segments fit within maxlen.
				2696
				2697	"""
				2698	if last_ew is not None and ew_combine_allowed:
				2699	to_encode = str(
				2700	get_unstructured(lines[-1][last_ew:] + to_encode))
				2701	lines[-1] = lines[-1][:last_ew]
				2702	if to_encode[0] in WSP:
				2703	# We're joining this to non-encoded text, so don't encode
				2704	# the leading blank.
				2705	leading_wsp = to_encode[0]
				2706	to_encode = to_encode[1:]
				2707	if (len(lines[-1]) == maxlen):
				2708	lines.append(_steal_trailing_WSP_if_exists(lines))
				2709	lines[-1] += leading_wsp
				2710	trailing_wsp = ''
				2711	if to_encode[-1] in WSP:
				2712	# Likewise for the trailing space.
				2713	trailing_wsp = to_encode[-1]
				2714	to_encode = to_encode[:-1]
				2715	new_last_ew = len(lines[-1]) if last_ew is None else last_ew
				2716	while to_encode:
				2717	remaining_space = maxlen - len(lines[-1])
				2718	# The RFC2047 chrome takes up 7 characters plus the length
				2719	# of the charset name.
				2720	encode_as = 'utf-8' if charset == 'us-ascii' else charset
				2721	text_space = remaining_space - len(encode_as) - 7
				2722	if text_space <= 0:
				2723	lines.append(' ')
				2724	# XXX We'll get an infinite loop here if maxlen is <= 7
				2725	continue
				2726	first_part = to_encode[:text_space]
				2727	ew = _ew.encode(first_part, charset=encode_as)
				2728	excess = len(ew) - remaining_space
				2729	if excess > 0:
				2730	# encode always chooses the shortest encoding, so this
				2731	# is guaranteed to fit at this point.
				2732	first_part = first_part[:-excess]
				2733	ew = _ew.encode(first_part)
				2734	lines[-1] += ew
				2735	to_encode = to_encode[len(first_part):]
				2736	if to_encode:
				2737	lines.append(' ')
				2738	new_last_ew = len(lines[-1])
				2739	lines[-1] += trailing_wsp
				2740	return new_last_ew if ew_combine_allowed else None
				2741
				2742	def _fold_mime_parameters(part, lines, maxlen, encoding):
				2743	"""Fold TokenList 'part' into the 'lines' list as mime parameters.
				2744
				2745	Using the decoded list of parameters and values, format them according to
				2746	the RFC rules, including using RFC2231 encoding if the value cannot be
Leo Arias	c3d9508	2018-02-03 18:36:10 -0600	[diff] [blame]	2747	expressed in 'encoding' and/or the parameter+value is too long to fit
				2748	within 'maxlen'.
R. David Murray	85d5c18	2017-12-03 18:51:41 -0500	[diff] [blame]	2749
				2750	"""
				2751	# Special case for RFC2231 encoding: start from decoded values and use
				2752	# RFC2231 encoding iff needed.
				2753	#
				2754	# Note that the 1 and 2s being added to the length calculations are
				2755	# accounting for the possibly-needed spaces and semicolons we'll be adding.
				2756	#
				2757	for name, value in part.params:
				2758	# XXX What if this ';' puts us over maxlen the first time through the
				2759	# loop? We should split the header value onto a newline in that case,
				2760	# but to do that we need to recognize the need earlier or reparse the
				2761	# header, so I'm going to ignore that bug for now. It'll only put us
				2762	# one character over.
				2763	if not lines[-1].rstrip().endswith(';'):
				2764	lines[-1] += ';'
				2765	charset = encoding
				2766	error_handler = 'strict'
				2767	try:
				2768	value.encode(encoding)
				2769	encoding_required = False
				2770	except UnicodeEncodeError:
				2771	encoding_required = True
				2772	if utils._has_surrogates(value):
				2773	charset = 'unknown-8bit'
				2774	error_handler = 'surrogateescape'
				2775	else:
				2776	charset = 'utf-8'
				2777	if encoding_required:
				2778	encoded_value = urllib.parse.quote(
				2779	value, safe='', errors=error_handler)
				2780	tstr = "{}*={}''{}".format(name, charset, encoded_value)
				2781	else:
				2782	tstr = '{}={}'.format(name, quote_string(value))
				2783	if len(lines[-1]) + len(tstr) + 1 < maxlen:
				2784	lines[-1] = lines[-1] + ' ' + tstr
				2785	continue
				2786	elif len(tstr) + 2 <= maxlen:
				2787	lines.append(' ' + tstr)
				2788	continue
				2789	# We need multiple sections. We are allowed to mix encoded and
				2790	# non-encoded sections, but we aren't going to. We'll encode them all.
				2791	section = 0
				2792	extra_chrome = charset + "''"
				2793	while value:
				2794	chrome_len = len(name) + len(str(section)) + 3 + len(extra_chrome)
				2795	if maxlen <= chrome_len + 3:
				2796	# We need room for the leading blank, the trailing semicolon,
				2797	# and at least one character of the value. If we don't
				2798	# have that, we'd be stuck, so in that case fall back to
				2799	# the RFC standard width.
				2800	maxlen = 78
				2801	splitpoint = maxchars = maxlen - chrome_len - 2
				2802	while True:
				2803	partial = value[:splitpoint]
				2804	encoded_value = urllib.parse.quote(
				2805	partial, safe='', errors=error_handler)
				2806	if len(encoded_value) <= maxchars:
				2807	break
				2808	splitpoint -= 1
				2809	lines.append(" {}{}={}{}".format(
				2810	name, section, extra_chrome, encoded_value))
				2811	extra_chrome = ''
				2812	section += 1
				2813	value = value[splitpoint:]
				2814	if value:
				2815	lines[-1] += ';'