Blame - Lib/email/_parseaddr.py - platform/external/python/cpython3

blob: ba5ad5a36d06b740d6d515aa7bd47d8464483155 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2002-2007 Python Software Foundation
				2	# Contact: email-sig@python.org
				3
				4	"""Email address parsing code.
				5
				6	Lifted directly from rfc822.py. This should eventually be rewritten.
				7	"""
				8
				9	__all__ = [
				10	'mktime_tz',
				11	'parsedate',
				12	'parsedate_tz',
				13	'quote',
				14	]
				15
Alexander Belopolsky	a07548e	2012-06-21 20:34:09 -0400	[diff] [blame]	16	import time, calendar
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	17
				18	SPACE = ' '
				19	EMPTYSTRING = ''
				20	COMMASPACE = ', '
				21
				22	# Parse a date field
				23	_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
				24	'aug', 'sep', 'oct', 'nov', 'dec',
				25	'january', 'february', 'march', 'april', 'may', 'june', 'july',
				26	'august', 'september', 'october', 'november', 'december']
				27
				28	_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
				29
				30	# The timezone table does not include the military time zones defined
				31	# in RFC822, other than Z. According to RFC1123, the description in
				32	# RFC822 gets the signs wrong, so we can't rely on any such time
				33	# zones. RFC1123 recommends that numeric timezone indicators be used
				34	# instead of timezone names.
				35
				36	_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
				37	'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
				38	'EST': -500, 'EDT': -400, # Eastern
				39	'CST': -600, 'CDT': -500, # Central
				40	'MST': -700, 'MDT': -600, # Mountain
				41	'PST': -800, 'PDT': -700 # Pacific
				42	}
				43
				44
				45	def parsedate_tz(data):
				46	"""Convert a date string to a time tuple.
				47
				48	Accounts for military timezones.
				49	"""
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	50	res = _parsedate_tz(data)
Georg Brandl	1aca31e	2012-09-22 09:03:56 +0200	[diff] [blame]	51	if not res:
				52	return
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	53	if res[9] is None:
				54	res[9] = 0
				55	return tuple(res)
				56
				57	def _parsedate_tz(data):
				58	"""Convert date to extended time tuple.
				59
				60	The last (additional) element is the time zone offset in seconds, except if
				61	the timezone was specified as -0000. In that case the last element is
				62	None. This indicates a UTC timestamp that explicitly declaims knowledge of
				63	the source timezone, as opposed to a +0000 timestamp that indicates the
				64	source timezone really was UTC.
				65
				66	"""
Georg Brandl	1aca31e	2012-09-22 09:03:56 +0200	[diff] [blame]	67	if not data:
Georges Toth	303aac8	2020-10-27 01:31:06 +0100	[diff] [blame]	68	return None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	69	data = data.split()
Miss Islington (bot)	9a79242	2021-08-26 08:47:27 -0700	[diff] [blame]	70	if not data: # This happens for whitespace-only input.
				71	return None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	72	# The FWS after the comma after the day-of-week is optional, so search and
				73	# adjust for this.
				74	if data[0].endswith(',') or data[0].lower() in _daynames:
				75	# There's a dayname here. Skip it
				76	del data[0]
				77	else:
				78	i = data[0].rfind(',')
				79	if i >= 0:
				80	data[0] = data[0][i+1:]
				81	if len(data) == 3: # RFC 850 date, deprecated
				82	stuff = data[0].split('-')
				83	if len(stuff) == 3:
				84	data = stuff + data[1:]
				85	if len(data) == 4:
				86	s = data[3]
				87	i = s.find('+')
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	88	if i == -1:
				89	i = s.find('-')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	90	if i > 0:
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	91	data[3:] = [s[:i], s[i:]]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	92	else:
				93	data.append('') # Dummy tz
				94	if len(data) < 5:
				95	return None
				96	data = data[:5]
				97	[dd, mm, yy, tm, tz] = data
				98	mm = mm.lower()
				99	if mm not in _monthnames:
				100	dd, mm = mm, dd.lower()
				101	if mm not in _monthnames:
				102	return None
				103	mm = _monthnames.index(mm) + 1
				104	if mm > 12:
				105	mm -= 12
				106	if dd[-1] == ',':
				107	dd = dd[:-1]
				108	i = yy.find(':')
				109	if i > 0:
				110	yy, tm = tm, yy
				111	if yy[-1] == ',':
				112	yy = yy[:-1]
				113	if not yy[0].isdigit():
				114	yy, tz = tz, yy
				115	if tm[-1] == ',':
				116	tm = tm[:-1]
				117	tm = tm.split(':')
				118	if len(tm) == 2:
				119	[thh, tmm] = tm
				120	tss = '0'
				121	elif len(tm) == 3:
				122	[thh, tmm, tss] = tm
R David Murray	accd1c0	2011-03-13 20:06:23 -0400	[diff] [blame]	123	elif len(tm) == 1 and '.' in tm[0]:
				124	# Some non-compliant MUAs use '.' to separate time elements.
				125	tm = tm[0].split('.')
				126	if len(tm) == 2:
				127	[thh, tmm] = tm
				128	tss = 0
				129	elif len(tm) == 3:
				130	[thh, tmm, tss] = tm
Łukasz Langa	f8473f6	2021-10-13 19:12:22 +0200	[diff] [blame]	131	else:
				132	return None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	133	else:
				134	return None
				135	try:
				136	yy = int(yy)
				137	dd = int(dd)
				138	thh = int(thh)
				139	tmm = int(tmm)
				140	tss = int(tss)
				141	except ValueError:
				142	return None
R. David Murray	219d1c8	2010-08-25 00:45:55 +0000	[diff] [blame]	143	# Check for a yy specified in two-digit format, then convert it to the
				144	# appropriate four-digit format, according to the POSIX standard. RFC 822
				145	# calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
				146	# mandates a 4-digit yy. For more information, see the documentation for
				147	# the time module.
				148	if yy < 100:
				149	# The year is between 1969 and 1999 (inclusive).
				150	if yy > 68:
				151	yy += 1900
				152	# The year is between 2000 and 2068 (inclusive).
				153	else:
				154	yy += 2000
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	155	tzoffset = None
				156	tz = tz.upper()
				157	if tz in _timezones:
				158	tzoffset = _timezones[tz]
				159	else:
				160	try:
				161	tzoffset = int(tz)
				162	except ValueError:
				163	pass
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	164	if tzoffset==0 and tz.startswith('-'):
				165	tzoffset = None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	166	# Convert a timezone offset into seconds ; -0500 -> -18000
				167	if tzoffset:
				168	if tzoffset < 0:
				169	tzsign = -1
				170	tzoffset = -tzoffset
				171	else:
				172	tzsign = 1
				173	tzoffset = tzsign * ( (tzoffset//100)3600 + (tzoffset % 100)60)
				174	# Daylight Saving Time flag is set to -1, since DST is unknown.
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	175	return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	176
				177
				178	def parsedate(data):
				179	"""Convert a time string to a time tuple."""
				180	t = parsedate_tz(data)
				181	if isinstance(t, tuple):
				182	return t[:9]
				183	else:
				184	return t
				185
				186
				187	def mktime_tz(data):
Alexander Belopolsky	a07548e	2012-06-21 20:34:09 -0400	[diff] [blame]	188	"""Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	189	if data[9] is None:
				190	# No zone info, so localtime is better assumption than GMT
				191	return time.mktime(data[:8] + (-1,))
				192	else:
Alexander Belopolsky	a07548e	2012-06-21 20:34:09 -0400	[diff] [blame]	193	t = calendar.timegm(data)
				194	return t - data[9]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	195
				196
				197	def quote(str):
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	198	"""Prepare string to be used in a quoted string.
				199
				200	Turns backslash and double quote characters into quoted pairs. These
				201	are the only characters that need to be quoted inside a quoted string.
				202	Does not add the surrounding double quotes.
				203	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	204	return str.replace('\\', '\\\\').replace('"', '\\"')
				205
				206
				207	class AddrlistClass:
				208	"""Address parser class by Ben Escoto.
				209
				210	To understand what this class does, it helps to have a copy of RFC 2822 in
				211	front of you.
				212
				213	Note: this class interface is deprecated and may be removed in the future.
Florent Xicluna	992d9e0	2011-11-11 19:35:42 +0100	[diff] [blame]	214	Use email.utils.AddressList instead.
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	215	"""
				216
				217	def __init__(self, field):
				218	"""Initialize a new instance.
				219
				220	`field' is an unparsed address header field, containing
				221	one or more addresses.
				222	"""
				223	self.specials = '()<>@,:;.\"[]'
				224	self.pos = 0
				225	self.LWS = ' \t'
				226	self.CR = '\r\n'
				227	self.FWS = self.LWS + self.CR
				228	self.atomends = self.specials + self.LWS + self.CR
				229	# Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
				230	# is obsolete syntax. RFC 2822 requires that we recognize obsolete
				231	# syntax, so allow dots in phrases.
				232	self.phraseends = self.atomends.replace('.', '')
				233	self.field = field
				234	self.commentlist = []
				235
				236	def gotonext(self):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	237	"""Skip white space and extract comments."""
				238	wslist = []
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	239	while self.pos < len(self.field):
				240	if self.field[self.pos] in self.LWS + '\n\r':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	241	if self.field[self.pos] not in '\n\r':
				242	wslist.append(self.field[self.pos])
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	243	self.pos += 1
				244	elif self.field[self.pos] == '(':
				245	self.commentlist.append(self.getcomment())
				246	else:
				247	break
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	248	return EMPTYSTRING.join(wslist)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	249
				250	def getaddrlist(self):
				251	"""Parse all addresses.
				252
				253	Returns a list containing all of the addresses.
				254	"""
				255	result = []
				256	while self.pos < len(self.field):
				257	ad = self.getaddress()
				258	if ad:
				259	result += ad
				260	else:
				261	result.append(('', ''))
				262	return result
				263
				264	def getaddress(self):
				265	"""Parse the next address."""
				266	self.commentlist = []
				267	self.gotonext()
				268
				269	oldpos = self.pos
				270	oldcl = self.commentlist
				271	plist = self.getphraselist()
				272
				273	self.gotonext()
				274	returnlist = []
				275
				276	if self.pos >= len(self.field):
				277	# Bad email address technically, no domain.
				278	if plist:
				279	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				280
				281	elif self.field[self.pos] in '.@':
				282	# email address is just an addrspec
				283	# this isn't very efficient since we start over
				284	self.pos = oldpos
				285	self.commentlist = oldcl
				286	addrspec = self.getaddrspec()
				287	returnlist = [(SPACE.join(self.commentlist), addrspec)]
				288
				289	elif self.field[self.pos] == ':':
				290	# address is a group
				291	returnlist = []
				292
				293	fieldlen = len(self.field)
				294	self.pos += 1
				295	while self.pos < len(self.field):
				296	self.gotonext()
				297	if self.pos < fieldlen and self.field[self.pos] == ';':
				298	self.pos += 1
				299	break
				300	returnlist = returnlist + self.getaddress()
				301
				302	elif self.field[self.pos] == '<':
				303	# Address is a phrase then a route addr
				304	routeaddr = self.getrouteaddr()
				305
				306	if self.commentlist:
				307	returnlist = [(SPACE.join(plist) + ' (' +
				308	' '.join(self.commentlist) + ')', routeaddr)]
				309	else:
				310	returnlist = [(SPACE.join(plist), routeaddr)]
				311
				312	else:
				313	if plist:
				314	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				315	elif self.field[self.pos] in self.specials:
				316	self.pos += 1
				317
				318	self.gotonext()
				319	if self.pos < len(self.field) and self.field[self.pos] == ',':
				320	self.pos += 1
				321	return returnlist
				322
				323	def getrouteaddr(self):
				324	"""Parse a route address (Return-path value).
				325
				326	This method just skips all the route stuff and returns the addrspec.
				327	"""
				328	if self.field[self.pos] != '<':
				329	return
				330
				331	expectroute = False
				332	self.pos += 1
				333	self.gotonext()
				334	adlist = ''
				335	while self.pos < len(self.field):
				336	if expectroute:
				337	self.getdomain()
				338	expectroute = False
				339	elif self.field[self.pos] == '>':
				340	self.pos += 1
				341	break
				342	elif self.field[self.pos] == '@':
				343	self.pos += 1
				344	expectroute = True
				345	elif self.field[self.pos] == ':':
				346	self.pos += 1
				347	else:
				348	adlist = self.getaddrspec()
				349	self.pos += 1
				350	break
				351	self.gotonext()
				352
				353	return adlist
				354
				355	def getaddrspec(self):
				356	"""Parse an RFC 2822 addr-spec."""
				357	aslist = []
				358
				359	self.gotonext()
				360	while self.pos < len(self.field):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	361	preserve_ws = True
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	362	if self.field[self.pos] == '.':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	363	if aslist and not aslist[-1].strip():
				364	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	365	aslist.append('.')
				366	self.pos += 1
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	367	preserve_ws = False
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	368	elif self.field[self.pos] == '"':
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	369	aslist.append('"%s"' % quote(self.getquote()))
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	370	elif self.field[self.pos] in self.atomends:
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	371	if aslist and not aslist[-1].strip():
				372	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	373	break
				374	else:
				375	aslist.append(self.getatom())
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	376	ws = self.gotonext()
				377	if preserve_ws and ws:
				378	aslist.append(ws)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	379
				380	if self.pos >= len(self.field) or self.field[self.pos] != '@':
				381	return EMPTYSTRING.join(aslist)
				382
				383	aslist.append('@')
				384	self.pos += 1
				385	self.gotonext()
jpic	8cb65d1	2019-07-17 23:54:25 +0200	[diff] [blame]	386	domain = self.getdomain()
				387	if not domain:
				388	# Invalid domain, return an empty address instead of returning a
				389	# local part to denote failed parsing.
				390	return EMPTYSTRING
				391	return EMPTYSTRING.join(aslist) + domain
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	392
				393	def getdomain(self):
				394	"""Get the complete domain name from an address."""
				395	sdlist = []
				396	while self.pos < len(self.field):
				397	if self.field[self.pos] in self.LWS:
				398	self.pos += 1
				399	elif self.field[self.pos] == '(':
				400	self.commentlist.append(self.getcomment())
				401	elif self.field[self.pos] == '[':
				402	sdlist.append(self.getdomainliteral())
				403	elif self.field[self.pos] == '.':
				404	self.pos += 1
				405	sdlist.append('.')
jpic	8cb65d1	2019-07-17 23:54:25 +0200	[diff] [blame]	406	elif self.field[self.pos] == '@':
				407	# bpo-34155: Don't parse domains with two `@` like
				408	# `a@malicious.org@important.com`.
				409	return EMPTYSTRING
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	410	elif self.field[self.pos] in self.atomends:
				411	break
				412	else:
				413	sdlist.append(self.getatom())
				414	return EMPTYSTRING.join(sdlist)
				415
				416	def getdelimited(self, beginchar, endchars, allowcomments=True):
				417	"""Parse a header fragment delimited by special characters.
				418
				419	`beginchar' is the start character for the fragment.
				420	If self is not looking at an instance of `beginchar' then
				421	getdelimited returns the empty string.
				422
				423	`endchars' is a sequence of allowable end-delimiting characters.
				424	Parsing stops when one of these is encountered.
				425
				426	If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
				427	within the parsed fragment.
				428	"""
				429	if self.field[self.pos] != beginchar:
				430	return ''
				431
				432	slist = ['']
				433	quote = False
				434	self.pos += 1
				435	while self.pos < len(self.field):
				436	if quote:
				437	slist.append(self.field[self.pos])
				438	quote = False
				439	elif self.field[self.pos] in endchars:
				440	self.pos += 1
				441	break
				442	elif allowcomments and self.field[self.pos] == '(':
				443	slist.append(self.getcomment())
				444	continue # have already advanced pos from getcomment
				445	elif self.field[self.pos] == '\\':
				446	quote = True
				447	else:
				448	slist.append(self.field[self.pos])
				449	self.pos += 1
				450
				451	return EMPTYSTRING.join(slist)
				452
				453	def getquote(self):
				454	"""Get a quote-delimited fragment from self's field."""
				455	return self.getdelimited('"', '"\r', False)
				456
				457	def getcomment(self):
				458	"""Get a parenthesis-delimited fragment from self's field."""
				459	return self.getdelimited('(', ')\r', True)
				460
				461	def getdomainliteral(self):
				462	"""Parse an RFC 2822 domain-literal."""
				463	return '[%s]' % self.getdelimited('[', ']\r', False)
				464
				465	def getatom(self, atomends=None):
				466	"""Parse an RFC 2822 atom.
				467
				468	Optional atomends specifies a different set of end token delimiters
				469	(the default is to use self.atomends). This is used e.g. in
				470	getphraselist() since phrase endings must not include the `.' (which
				471	is legal in phrases)."""
				472	atomlist = ['']
				473	if atomends is None:
				474	atomends = self.atomends
				475
				476	while self.pos < len(self.field):
				477	if self.field[self.pos] in atomends:
				478	break
				479	else:
				480	atomlist.append(self.field[self.pos])
				481	self.pos += 1
				482
				483	return EMPTYSTRING.join(atomlist)
				484
				485	def getphraselist(self):
				486	"""Parse a sequence of RFC 2822 phrases.
				487
				488	A phrase is a sequence of words, which are in turn either RFC 2822
				489	atoms or quoted-strings. Phrases are canonicalized by squeezing all
				490	runs of continuous whitespace into one space.
				491	"""
				492	plist = []
				493
				494	while self.pos < len(self.field):
				495	if self.field[self.pos] in self.FWS:
				496	self.pos += 1
				497	elif self.field[self.pos] == '"':
				498	plist.append(self.getquote())
				499	elif self.field[self.pos] == '(':
				500	self.commentlist.append(self.getcomment())
				501	elif self.field[self.pos] in self.phraseends:
				502	break
				503	else:
				504	plist.append(self.getatom(self.phraseends))
				505
				506	return plist
				507
				508	class AddressList(AddrlistClass):
				509	"""An AddressList encapsulates a list of parsed RFC 2822 addresses."""
				510	def __init__(self, field):
				511	AddrlistClass.__init__(self, field)
				512	if field:
				513	self.addresslist = self.getaddrlist()
				514	else:
				515	self.addresslist = []
				516
				517	def __len__(self):
				518	return len(self.addresslist)
				519
				520	def __add__(self, other):
				521	# Set union
				522	newaddr = AddressList(None)
				523	newaddr.addresslist = self.addresslist[:]
				524	for x in other.addresslist:
				525	if not x in self.addresslist:
				526	newaddr.addresslist.append(x)
				527	return newaddr
				528
				529	def __iadd__(self, other):
				530	# Set union, in-place
				531	for x in other.addresslist:
				532	if not x in self.addresslist:
				533	self.addresslist.append(x)
				534	return self
				535
				536	def __sub__(self, other):
				537	# Set difference
				538	newaddr = AddressList(None)
				539	for x in self.addresslist:
				540	if not x in other.addresslist:
				541	newaddr.addresslist.append(x)
				542	return newaddr
				543
				544	def __isub__(self, other):
				545	# Set difference, in-place
				546	for x in other.addresslist:
				547	if x in self.addresslist:
				548	self.addresslist.remove(x)
				549	return self
				550
				551	def __getitem__(self, index):
				552	# Make indexing, slices, and 'in' work
				553	return self.addresslist[index]