Blame - common/py3-stdlib/email/_parseaddr.py - platform/prebuilts/build-tools

blob: 41ff6f8c000d57d23445fa529b33297b1ffa07fc [file] [log] [blame]

Jingwen Chen	475b3cc	2021-01-05 21:45:16 -0500	[diff] [blame]	1	# Copyright (C) 2002-2007 Python Software Foundation
				2	# Contact: email-sig@python.org
				3
				4	"""Email address parsing code.
				5
				6	Lifted directly from rfc822.py. This should eventually be rewritten.
				7	"""
				8
# Public names exported by a star-import of this module.  Only the date
# helpers and quote() are listed; the address-parsing classes defined further
# down are not part of this list.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
				15
				16	import time, calendar
				17
# Shared separator strings.  SPACE and EMPTYSTRING are used with str.join()
# by the address parser in this module when it reassembles parsed pieces.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
				21
				22	# Parse a date field
				23	_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
				24	'aug', 'sep', 'oct', 'nov', 'dec',
				25	'january', 'february', 'march', 'april', 'may', 'june', 'july',
				26	'august', 'september', 'october', 'november', 'december']
				27
				28	_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
				29
				30	# The timezone table does not include the military time zones defined
				31	# in RFC822, other than Z. According to RFC1123, the description in
				32	# RFC822 gets the signs wrong, so we can't rely on any such time
				33	# zones. RFC1123 recommends that numeric timezone indicators be used
				34	# instead of timezone names.
				35
				36	_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
				37	'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
				38	'EST': -500, 'EDT': -400, # Eastern
				39	'CST': -600, 'CDT': -500, # Central
				40	'MST': -700, 'MDT': -600, # Mountain
				41	'PST': -800, 'PDT': -700 # Pacific
				42	}
				43
				44
				45	def parsedate_tz(data):
				46	"""Convert a date string to a time tuple.
				47
				48	Accounts for military timezones.
				49	"""
				50	res = _parsedate_tz(data)
				51	if not res:
				52	return
				53	if res[9] is None:
				54	res[9] = 0
				55	return tuple(res)
				56
				57	def _parsedate_tz(data):
				58	"""Convert date to extended time tuple.
				59
				60	The last (additional) element is the time zone offset in seconds, except if
				61	the timezone was specified as -0000. In that case the last element is
				62	None. This indicates a UTC timestamp that explicitly declaims knowledge of
				63	the source timezone, as opposed to a +0000 timestamp that indicates the
				64	source timezone really was UTC.
				65
				66	"""
				67	if not data:
				68	return
				69	data = data.split()
				70	# The FWS after the comma after the day-of-week is optional, so search and
				71	# adjust for this.
				72	if data[0].endswith(',') or data[0].lower() in _daynames:
				73	# There's a dayname here. Skip it
				74	del data[0]
				75	else:
				76	i = data[0].rfind(',')
				77	if i >= 0:
				78	data[0] = data[0][i+1:]
				79	if len(data) == 3: # RFC 850 date, deprecated
				80	stuff = data[0].split('-')
				81	if len(stuff) == 3:
				82	data = stuff + data[1:]
				83	if len(data) == 4:
				84	s = data[3]
				85	i = s.find('+')
				86	if i == -1:
				87	i = s.find('-')
				88	if i > 0:
				89	data[3:] = [s[:i], s[i:]]
				90	else:
				91	data.append('') # Dummy tz
				92	if len(data) < 5:
				93	return None
				94	data = data[:5]
				95	[dd, mm, yy, tm, tz] = data
				96	mm = mm.lower()
				97	if mm not in _monthnames:
				98	dd, mm = mm, dd.lower()
				99	if mm not in _monthnames:
				100	return None
				101	mm = _monthnames.index(mm) + 1
				102	if mm > 12:
				103	mm -= 12
				104	if dd[-1] == ',':
				105	dd = dd[:-1]
				106	i = yy.find(':')
				107	if i > 0:
				108	yy, tm = tm, yy
				109	if yy[-1] == ',':
				110	yy = yy[:-1]
				111	if not yy[0].isdigit():
				112	yy, tz = tz, yy
				113	if tm[-1] == ',':
				114	tm = tm[:-1]
				115	tm = tm.split(':')
				116	if len(tm) == 2:
				117	[thh, tmm] = tm
				118	tss = '0'
				119	elif len(tm) == 3:
				120	[thh, tmm, tss] = tm
				121	elif len(tm) == 1 and '.' in tm[0]:
				122	# Some non-compliant MUAs use '.' to separate time elements.
				123	tm = tm[0].split('.')
				124	if len(tm) == 2:
				125	[thh, tmm] = tm
				126	tss = 0
				127	elif len(tm) == 3:
				128	[thh, tmm, tss] = tm
				129	else:
				130	return None
				131	try:
				132	yy = int(yy)
				133	dd = int(dd)
				134	thh = int(thh)
				135	tmm = int(tmm)
				136	tss = int(tss)
				137	except ValueError:
				138	return None
				139	# Check for a yy specified in two-digit format, then convert it to the
				140	# appropriate four-digit format, according to the POSIX standard. RFC 822
				141	# calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
				142	# mandates a 4-digit yy. For more information, see the documentation for
				143	# the time module.
				144	if yy < 100:
				145	# The year is between 1969 and 1999 (inclusive).
				146	if yy > 68:
				147	yy += 1900
				148	# The year is between 2000 and 2068 (inclusive).
				149	else:
				150	yy += 2000
				151	tzoffset = None
				152	tz = tz.upper()
				153	if tz in _timezones:
				154	tzoffset = _timezones[tz]
				155	else:
				156	try:
				157	tzoffset = int(tz)
				158	except ValueError:
				159	pass
				160	if tzoffset==0 and tz.startswith('-'):
				161	tzoffset = None
				162	# Convert a timezone offset into seconds ; -0500 -> -18000
				163	if tzoffset:
				164	if tzoffset < 0:
				165	tzsign = -1
				166	tzoffset = -tzoffset
				167	else:
				168	tzsign = 1
				169	tzoffset = tzsign * ( (tzoffset//100)3600 + (tzoffset % 100)60)
				170	# Daylight Saving Time flag is set to -1, since DST is unknown.
				171	return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
				172
				173
				174	def parsedate(data):
				175	"""Convert a time string to a time tuple."""
				176	t = parsedate_tz(data)
				177	if isinstance(t, tuple):
				178	return t[:9]
				179	else:
				180	return t
				181
				182
				183	def mktime_tz(data):
				184	"""Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
				185	if data[9] is None:
				186	# No zone info, so localtime is better assumption than GMT
				187	return time.mktime(data[:8] + (-1,))
				188	else:
				189	t = calendar.timegm(data)
				190	return t - data[9]
				191
				192
				193	def quote(str):
				194	"""Prepare string to be used in a quoted string.
				195
				196	Turns backslash and double quote characters into quoted pairs. These
				197	are the only characters that need to be quoted inside a quoted string.
				198	Does not add the surrounding double quotes.
				199	"""
				200	return str.replace('\\', '\\\\').replace('"', '\\"')
				201
				202
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`self.pos') into the header text
    (`self.field') and advances it as each get*() method consumes input.
    Comments encountered along the way are collected in `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        # Index of the next unread character in self.field.
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        # Characters that terminate an atom.
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        # Comment texts gathered while parsing the current address.
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string; CR and LF characters
        are skipped but not included.  Comment texts are appended to
        `self.commentlist'.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                # getcomment() advances self.pos past the comment.
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.  Whenever
        getaddress() yields nothing, a ('', '') placeholder is appended
        instead.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of 2-tuples; a group yields one tuple per member and
        an unparseable address yields an empty list.  Also consumes a
        trailing comma if one follows the address.
        """
        self.commentlist = []
        self.gotonext()

        # Remember where we started in case the phrase turns out to be the
        # local part of a bare addr-spec and has to be re-read.
        oldpos = self.pos
        # NOTE(review): this binds the same list object as self.commentlist
        # rather than a copy, so restoring it below does not drop comments
        # appended by getphraselist() -- confirm whether that is intended.
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            # Collect member addresses until the terminating ';' or the end
            # of the field.
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                # Comments are appended to the phrase in parentheses.
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special character so that the caller's
                # loop keeps making progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a '<', and '' if no addr-spec is
        found before the closing '>'.
        """
        if self.field[self.pos] != '<':
            return

        # True right after an '@' has been seen: the next token is a route
        # domain, which is parsed and discarded.
        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no '@' follows it, and the empty
        string when an '@' is present but the domain is empty or invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot, and do not
                # record whitespace that follows it.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # End of the local part; discard trailing whitespace.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Spaces and tabs are skipped and comments are moved to
        `self.commentlist'.  Returns the empty string if a second '@' is
        encountered.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment text without its delimiters; a backslash is
        dropped and the character after it is kept literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash.  This local
        # shadows the module-level quote() function inside this method.
        quote = False
        self.pos += 1
        while self.pos < len(self.field):
            if quote:
                slist.append(self.field[self.pos])
                quote = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        # Consume characters up to, but not including, the first delimiter.
        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are moved to `self.commentlist'.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
				503
				504	class AddressList(AddrlistClass):
				505	"""An AddressList encapsulates a list of parsed RFC 2822 addresses."""
				506	def __init__(self, field):
				507	AddrlistClass.__init__(self, field)
				508	if field:
				509	self.addresslist = self.getaddrlist()
				510	else:
				511	self.addresslist = []
				512
				513	def __len__(self):
				514	return len(self.addresslist)
				515
				516	def __add__(self, other):
				517	# Set union
				518	newaddr = AddressList(None)
				519	newaddr.addresslist = self.addresslist[:]
				520	for x in other.addresslist:
				521	if not x in self.addresslist:
				522	newaddr.addresslist.append(x)
				523	return newaddr
				524
				525	def __iadd__(self, other):
				526	# Set union, in-place
				527	for x in other.addresslist:
				528	if not x in self.addresslist:
				529	self.addresslist.append(x)
				530	return self
				531
				532	def __sub__(self, other):
				533	# Set difference
				534	newaddr = AddressList(None)
				535	for x in self.addresslist:
				536	if not x in other.addresslist:
				537	newaddr.addresslist.append(x)
				538	return newaddr
				539
				540	def __isub__(self, other):
				541	# Set difference, in-place
				542	for x in other.addresslist:
				543	if x in self.addresslist:
				544	self.addresslist.remove(x)
				545	return self
				546
				547	def __getitem__(self, index):
				548	# Make indexing, slices, and 'in' work
				549	return self.addresslist[index]