Blame - Lib/email/_parseaddr.py - platform/external/python/cpython3

blob: 977fedf67b1591db4fe2ac532523e4e056bdac3a [file] [log] [blame]

# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

# Names exported by ``from ... import *``.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
				15
import calendar
import time
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	17
				18	SPACE = ' '
				19	EMPTYSTRING = ''
				20	COMMASPACE = ', '
				21
				22	# Parse a date field
				23	_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
				24	'aug', 'sep', 'oct', 'nov', 'dec',
				25	'january', 'february', 'march', 'april', 'may', 'june', 'july',
				26	'august', 'september', 'october', 'november', 'december']
				27
				28	_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
				29
				30	# The timezone table does not include the military time zones defined
				31	# in RFC822, other than Z. According to RFC1123, the description in
				32	# RFC822 gets the signs wrong, so we can't rely on any such time
				33	# zones. RFC1123 recommends that numeric timezone indicators be used
				34	# instead of timezone names.
				35
				36	_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
				37	'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
				38	'EST': -500, 'EDT': -400, # Eastern
				39	'CST': -600, 'CDT': -500, # Central
				40	'MST': -700, 'MDT': -600, # Mountain
				41	'PST': -800, 'PDT': -700 # Pacific
				42	}
				43
				44
				45	def parsedate_tz(data):
				46	"""Convert a date string to a time tuple.
				47
				48	Accounts for military timezones.
				49	"""
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	50	res = _parsedate_tz(data)
Georg Brandl	1aca31e	2012-09-22 09:03:56 +0200	[diff] [blame]	51	if not res:
				52	return
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	53	if res[9] is None:
				54	res[9] = 0
				55	return tuple(res)
				56
				57	def _parsedate_tz(data):
				58	"""Convert date to extended time tuple.
				59
				60	The last (additional) element is the time zone offset in seconds, except if
				61	the timezone was specified as -0000. In that case the last element is
				62	None. This indicates a UTC timestamp that explicitly declaims knowledge of
				63	the source timezone, as opposed to a +0000 timestamp that indicates the
				64	source timezone really was UTC.
				65
				66	"""
Georg Brandl	1aca31e	2012-09-22 09:03:56 +0200	[diff] [blame]	67	if not data:
Georges Toth	303aac8	2020-10-27 01:31:06 +0100	[diff] [blame]	68	return None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	69	data = data.split()
Miss Islington (bot)	9a79242	2021-08-26 08:47:27 -0700	[diff] [blame]	70	if not data: # This happens for whitespace-only input.
				71	return None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	72	# The FWS after the comma after the day-of-week is optional, so search and
				73	# adjust for this.
				74	if data[0].endswith(',') or data[0].lower() in _daynames:
				75	# There's a dayname here. Skip it
				76	del data[0]
				77	else:
				78	i = data[0].rfind(',')
				79	if i >= 0:
				80	data[0] = data[0][i+1:]
				81	if len(data) == 3: # RFC 850 date, deprecated
				82	stuff = data[0].split('-')
				83	if len(stuff) == 3:
				84	data = stuff + data[1:]
				85	if len(data) == 4:
				86	s = data[3]
				87	i = s.find('+')
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	88	if i == -1:
				89	i = s.find('-')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	90	if i > 0:
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	91	data[3:] = [s[:i], s[i:]]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	92	else:
				93	data.append('') # Dummy tz
				94	if len(data) < 5:
				95	return None
				96	data = data[:5]
				97	[dd, mm, yy, tm, tz] = data
				98	mm = mm.lower()
				99	if mm not in _monthnames:
				100	dd, mm = mm, dd.lower()
				101	if mm not in _monthnames:
				102	return None
				103	mm = _monthnames.index(mm) + 1
				104	if mm > 12:
				105	mm -= 12
				106	if dd[-1] == ',':
				107	dd = dd[:-1]
				108	i = yy.find(':')
				109	if i > 0:
				110	yy, tm = tm, yy
				111	if yy[-1] == ',':
				112	yy = yy[:-1]
				113	if not yy[0].isdigit():
				114	yy, tz = tz, yy
				115	if tm[-1] == ',':
				116	tm = tm[:-1]
				117	tm = tm.split(':')
				118	if len(tm) == 2:
				119	[thh, tmm] = tm
				120	tss = '0'
				121	elif len(tm) == 3:
				122	[thh, tmm, tss] = tm
R David Murray	accd1c0	2011-03-13 20:06:23 -0400	[diff] [blame]	123	elif len(tm) == 1 and '.' in tm[0]:
				124	# Some non-compliant MUAs use '.' to separate time elements.
				125	tm = tm[0].split('.')
				126	if len(tm) == 2:
				127	[thh, tmm] = tm
				128	tss = 0
				129	elif len(tm) == 3:
				130	[thh, tmm, tss] = tm
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	131	else:
				132	return None
				133	try:
				134	yy = int(yy)
				135	dd = int(dd)
				136	thh = int(thh)
				137	tmm = int(tmm)
				138	tss = int(tss)
				139	except ValueError:
				140	return None
R. David Murray	219d1c8	2010-08-25 00:45:55 +0000	[diff] [blame]	141	# Check for a yy specified in two-digit format, then convert it to the
				142	# appropriate four-digit format, according to the POSIX standard. RFC 822
				143	# calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
				144	# mandates a 4-digit yy. For more information, see the documentation for
				145	# the time module.
				146	if yy < 100:
				147	# The year is between 1969 and 1999 (inclusive).
				148	if yy > 68:
				149	yy += 1900
				150	# The year is between 2000 and 2068 (inclusive).
				151	else:
				152	yy += 2000
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	153	tzoffset = None
				154	tz = tz.upper()
				155	if tz in _timezones:
				156	tzoffset = _timezones[tz]
				157	else:
				158	try:
				159	tzoffset = int(tz)
				160	except ValueError:
				161	pass
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	162	if tzoffset==0 and tz.startswith('-'):
				163	tzoffset = None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	164	# Convert a timezone offset into seconds ; -0500 -> -18000
				165	if tzoffset:
				166	if tzoffset < 0:
				167	tzsign = -1
				168	tzoffset = -tzoffset
				169	else:
				170	tzsign = 1
				171	tzoffset = tzsign * ( (tzoffset//100)3600 + (tzoffset % 100)60)
				172	# Daylight Saving Time flag is set to -1, since DST is unknown.
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame]	173	return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	174
				175
				176	def parsedate(data):
				177	"""Convert a time string to a time tuple."""
				178	t = parsedate_tz(data)
				179	if isinstance(t, tuple):
				180	return t[:9]
				181	else:
				182	return t
				183
				184
				185	def mktime_tz(data):
Alexander Belopolsky	a07548e	2012-06-21 20:34:09 -0400	[diff] [blame]	186	"""Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	187	if data[9] is None:
				188	# No zone info, so localtime is better assumption than GMT
				189	return time.mktime(data[:8] + (-1,))
				190	else:
Alexander Belopolsky	a07548e	2012-06-21 20:34:09 -0400	[diff] [blame]	191	t = calendar.timegm(data)
				192	return t - data[9]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	193
				194
				195	def quote(str):
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	196	"""Prepare string to be used in a quoted string.
				197
				198	Turns backslash and double quote characters into quoted pairs. These
				199	are the only characters that need to be quoted inside a quoted string.
				200	Does not add the surrounding double quotes.
				201	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	202	return str.replace('\\', '\\\\').replace('"', '\\"')
				203
				204
				205	class AddrlistClass:
				206	"""Address parser class by Ben Escoto.
				207
				208	To understand what this class does, it helps to have a copy of RFC 2822 in
				209	front of you.
				210
				211	Note: this class interface is deprecated and may be removed in the future.
Florent Xicluna	992d9e0	2011-11-11 19:35:42 +0100	[diff] [blame]	212	Use email.utils.AddressList instead.
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	213	"""
				214
				215	def __init__(self, field):
				216	"""Initialize a new instance.
				217
				218	`field' is an unparsed address header field, containing
				219	one or more addresses.
				220	"""
				221	self.specials = '()<>@,:;.\"[]'
				222	self.pos = 0
				223	self.LWS = ' \t'
				224	self.CR = '\r\n'
				225	self.FWS = self.LWS + self.CR
				226	self.atomends = self.specials + self.LWS + self.CR
				227	# Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
				228	# is obsolete syntax. RFC 2822 requires that we recognize obsolete
				229	# syntax, so allow dots in phrases.
				230	self.phraseends = self.atomends.replace('.', '')
				231	self.field = field
				232	self.commentlist = []
				233
				234	def gotonext(self):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	235	"""Skip white space and extract comments."""
				236	wslist = []
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	237	while self.pos < len(self.field):
				238	if self.field[self.pos] in self.LWS + '\n\r':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	239	if self.field[self.pos] not in '\n\r':
				240	wslist.append(self.field[self.pos])
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	241	self.pos += 1
				242	elif self.field[self.pos] == '(':
				243	self.commentlist.append(self.getcomment())
				244	else:
				245	break
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	246	return EMPTYSTRING.join(wslist)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	247
				248	def getaddrlist(self):
				249	"""Parse all addresses.
				250
				251	Returns a list containing all of the addresses.
				252	"""
				253	result = []
				254	while self.pos < len(self.field):
				255	ad = self.getaddress()
				256	if ad:
				257	result += ad
				258	else:
				259	result.append(('', ''))
				260	return result
				261
				262	def getaddress(self):
				263	"""Parse the next address."""
				264	self.commentlist = []
				265	self.gotonext()
				266
				267	oldpos = self.pos
				268	oldcl = self.commentlist
				269	plist = self.getphraselist()
				270
				271	self.gotonext()
				272	returnlist = []
				273
				274	if self.pos >= len(self.field):
				275	# Bad email address technically, no domain.
				276	if plist:
				277	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				278
				279	elif self.field[self.pos] in '.@':
				280	# email address is just an addrspec
				281	# this isn't very efficient since we start over
				282	self.pos = oldpos
				283	self.commentlist = oldcl
				284	addrspec = self.getaddrspec()
				285	returnlist = [(SPACE.join(self.commentlist), addrspec)]
				286
				287	elif self.field[self.pos] == ':':
				288	# address is a group
				289	returnlist = []
				290
				291	fieldlen = len(self.field)
				292	self.pos += 1
				293	while self.pos < len(self.field):
				294	self.gotonext()
				295	if self.pos < fieldlen and self.field[self.pos] == ';':
				296	self.pos += 1
				297	break
				298	returnlist = returnlist + self.getaddress()
				299
				300	elif self.field[self.pos] == '<':
				301	# Address is a phrase then a route addr
				302	routeaddr = self.getrouteaddr()
				303
				304	if self.commentlist:
				305	returnlist = [(SPACE.join(plist) + ' (' +
				306	' '.join(self.commentlist) + ')', routeaddr)]
				307	else:
				308	returnlist = [(SPACE.join(plist), routeaddr)]
				309
				310	else:
				311	if plist:
				312	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				313	elif self.field[self.pos] in self.specials:
				314	self.pos += 1
				315
				316	self.gotonext()
				317	if self.pos < len(self.field) and self.field[self.pos] == ',':
				318	self.pos += 1
				319	return returnlist
				320
				321	def getrouteaddr(self):
				322	"""Parse a route address (Return-path value).
				323
				324	This method just skips all the route stuff and returns the addrspec.
				325	"""
				326	if self.field[self.pos] != '<':
				327	return
				328
				329	expectroute = False
				330	self.pos += 1
				331	self.gotonext()
				332	adlist = ''
				333	while self.pos < len(self.field):
				334	if expectroute:
				335	self.getdomain()
				336	expectroute = False
				337	elif self.field[self.pos] == '>':
				338	self.pos += 1
				339	break
				340	elif self.field[self.pos] == '@':
				341	self.pos += 1
				342	expectroute = True
				343	elif self.field[self.pos] == ':':
				344	self.pos += 1
				345	else:
				346	adlist = self.getaddrspec()
				347	self.pos += 1
				348	break
				349	self.gotonext()
				350
				351	return adlist
				352
				353	def getaddrspec(self):
				354	"""Parse an RFC 2822 addr-spec."""
				355	aslist = []
				356
				357	self.gotonext()
				358	while self.pos < len(self.field):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	359	preserve_ws = True
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	360	if self.field[self.pos] == '.':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	361	if aslist and not aslist[-1].strip():
				362	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	363	aslist.append('.')
				364	self.pos += 1
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	365	preserve_ws = False
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	366	elif self.field[self.pos] == '"':
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	367	aslist.append('"%s"' % quote(self.getquote()))
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	368	elif self.field[self.pos] in self.atomends:
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	369	if aslist and not aslist[-1].strip():
				370	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	371	break
				372	else:
				373	aslist.append(self.getatom())
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	374	ws = self.gotonext()
				375	if preserve_ws and ws:
				376	aslist.append(ws)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	377
				378	if self.pos >= len(self.field) or self.field[self.pos] != '@':
				379	return EMPTYSTRING.join(aslist)
				380
				381	aslist.append('@')
				382	self.pos += 1
				383	self.gotonext()
jpic	8cb65d1	2019-07-17 23:54:25 +0200	[diff] [blame]	384	domain = self.getdomain()
				385	if not domain:
				386	# Invalid domain, return an empty address instead of returning a
				387	# local part to denote failed parsing.
				388	return EMPTYSTRING
				389	return EMPTYSTRING.join(aslist) + domain
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	390
				391	def getdomain(self):
				392	"""Get the complete domain name from an address."""
				393	sdlist = []
				394	while self.pos < len(self.field):
				395	if self.field[self.pos] in self.LWS:
				396	self.pos += 1
				397	elif self.field[self.pos] == '(':
				398	self.commentlist.append(self.getcomment())
				399	elif self.field[self.pos] == '[':
				400	sdlist.append(self.getdomainliteral())
				401	elif self.field[self.pos] == '.':
				402	self.pos += 1
				403	sdlist.append('.')
jpic	8cb65d1	2019-07-17 23:54:25 +0200	[diff] [blame]	404	elif self.field[self.pos] == '@':
				405	# bpo-34155: Don't parse domains with two `@` like
				406	# `a@malicious.org@important.com`.
				407	return EMPTYSTRING
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	408	elif self.field[self.pos] in self.atomends:
				409	break
				410	else:
				411	sdlist.append(self.getatom())
				412	return EMPTYSTRING.join(sdlist)
				413
				414	def getdelimited(self, beginchar, endchars, allowcomments=True):
				415	"""Parse a header fragment delimited by special characters.
				416
				417	`beginchar' is the start character for the fragment.
				418	If self is not looking at an instance of `beginchar' then
				419	getdelimited returns the empty string.
				420
				421	`endchars' is a sequence of allowable end-delimiting characters.
				422	Parsing stops when one of these is encountered.
				423
				424	If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
				425	within the parsed fragment.
				426	"""
				427	if self.field[self.pos] != beginchar:
				428	return ''
				429
				430	slist = ['']
				431	quote = False
				432	self.pos += 1
				433	while self.pos < len(self.field):
				434	if quote:
				435	slist.append(self.field[self.pos])
				436	quote = False
				437	elif self.field[self.pos] in endchars:
				438	self.pos += 1
				439	break
				440	elif allowcomments and self.field[self.pos] == '(':
				441	slist.append(self.getcomment())
				442	continue # have already advanced pos from getcomment
				443	elif self.field[self.pos] == '\\':
				444	quote = True
				445	else:
				446	slist.append(self.field[self.pos])
				447	self.pos += 1
				448
				449	return EMPTYSTRING.join(slist)
				450
				451	def getquote(self):
				452	"""Get a quote-delimited fragment from self's field."""
				453	return self.getdelimited('"', '"\r', False)
				454
				455	def getcomment(self):
				456	"""Get a parenthesis-delimited fragment from self's field."""
				457	return self.getdelimited('(', ')\r', True)
				458
				459	def getdomainliteral(self):
				460	"""Parse an RFC 2822 domain-literal."""
				461	return '[%s]' % self.getdelimited('[', ']\r', False)
				462
				463	def getatom(self, atomends=None):
				464	"""Parse an RFC 2822 atom.
				465
				466	Optional atomends specifies a different set of end token delimiters
				467	(the default is to use self.atomends). This is used e.g. in
				468	getphraselist() since phrase endings must not include the `.' (which
				469	is legal in phrases)."""
				470	atomlist = ['']
				471	if atomends is None:
				472	atomends = self.atomends
				473
				474	while self.pos < len(self.field):
				475	if self.field[self.pos] in atomends:
				476	break
				477	else:
				478	atomlist.append(self.field[self.pos])
				479	self.pos += 1
				480
				481	return EMPTYSTRING.join(atomlist)
				482
				483	def getphraselist(self):
				484	"""Parse a sequence of RFC 2822 phrases.
				485
				486	A phrase is a sequence of words, which are in turn either RFC 2822
				487	atoms or quoted-strings. Phrases are canonicalized by squeezing all
				488	runs of continuous whitespace into one space.
				489	"""
				490	plist = []
				491
				492	while self.pos < len(self.field):
				493	if self.field[self.pos] in self.FWS:
				494	self.pos += 1
				495	elif self.field[self.pos] == '"':
				496	plist.append(self.getquote())
				497	elif self.field[self.pos] == '(':
				498	self.commentlist.append(self.getcomment())
				499	elif self.field[self.pos] in self.phraseends:
				500	break
				501	else:
				502	plist.append(self.getatom(self.phraseends))
				503
				504	return plist
				505
				506	class AddressList(AddrlistClass):
				507	"""An AddressList encapsulates a list of parsed RFC 2822 addresses."""
				508	def __init__(self, field):
				509	AddrlistClass.__init__(self, field)
				510	if field:
				511	self.addresslist = self.getaddrlist()
				512	else:
				513	self.addresslist = []
				514
				515	def __len__(self):
				516	return len(self.addresslist)
				517
				518	def __add__(self, other):
				519	# Set union
				520	newaddr = AddressList(None)
				521	newaddr.addresslist = self.addresslist[:]
				522	for x in other.addresslist:
				523	if not x in self.addresslist:
				524	newaddr.addresslist.append(x)
				525	return newaddr
				526
				527	def __iadd__(self, other):
				528	# Set union, in-place
				529	for x in other.addresslist:
				530	if not x in self.addresslist:
				531	self.addresslist.append(x)
				532	return self
				533
				534	def __sub__(self, other):
				535	# Set difference
				536	newaddr = AddressList(None)
				537	for x in self.addresslist:
				538	if not x in other.addresslist:
				539	newaddr.addresslist.append(x)
				540	return newaddr
				541
				542	def __isub__(self, other):
				543	# Set difference, in-place
				544	for x in other.addresslist:
				545	if x in self.addresslist:
				546	self.addresslist.remove(x)
				547	return self
				548
				549	def __getitem__(self, index):
				550	# Make indexing, slices, and 'in' work
				551	return self.addresslist[index]