Blame - Lib/email/_parseaddr.py - platform/external/python/cpython3

blob: c455e056d0ba4ab582638c7c86c857c2fe10b8d7 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2002-2007 Python Software Foundation
				2	# Contact: email-sig@python.org
				3
				4	"""Email address parsing code.
				5
				6	Lifted directly from rfc822.py. This should eventually be rewritten.
				7	"""
				8
# Names exported by a star-import of this module.  The underscore-prefixed
# tables and the address-list classes defined further down are not listed.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
				15
				16	import time
				17
				18	SPACE = ' '
				19	EMPTYSTRING = ''
				20	COMMASPACE = ', '
				21
				22	# Parse a date field
				23	_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
				24	'aug', 'sep', 'oct', 'nov', 'dec',
				25	'january', 'february', 'march', 'april', 'may', 'june', 'july',
				26	'august', 'september', 'october', 'november', 'december']
				27
				28	_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
				29
				30	# The timezone table does not include the military time zones defined
				31	# in RFC822, other than Z. According to RFC1123, the description in
				32	# RFC822 gets the signs wrong, so we can't rely on any such time
				33	# zones. RFC1123 recommends that numeric timezone indicators be used
				34	# instead of timezone names.
				35
				36	_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
				37	'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
				38	'EST': -500, 'EDT': -400, # Eastern
				39	'CST': -600, 'CDT': -500, # Central
				40	'MST': -700, 'MDT': -600, # Mountain
				41	'PST': -800, 'PDT': -700 # Pacific
				42	}
				43
				44
				45	def parsedate_tz(data):
				46	"""Convert a date string to a time tuple.
				47
				48	Accounts for military timezones.
				49	"""
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame^]	50	res = _parsedate_tz(data)
				51	if res[9] is None:
				52	res[9] = 0
				53	return tuple(res)
				54
				55	def _parsedate_tz(data):
				56	"""Convert date to extended time tuple.
				57
				58	The last (additional) element is the time zone offset in seconds, except if
				59	the timezone was specified as -0000. In that case the last element is
				60	None. This indicates a UTC timestamp that explicitly declaims knowledge of
				61	the source timezone, as opposed to a +0000 timestamp that indicates the
				62	source timezone really was UTC.
				63
				64	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	65	data = data.split()
				66	# The FWS after the comma after the day-of-week is optional, so search and
				67	# adjust for this.
				68	if data[0].endswith(',') or data[0].lower() in _daynames:
				69	# There's a dayname here. Skip it
				70	del data[0]
				71	else:
				72	i = data[0].rfind(',')
				73	if i >= 0:
				74	data[0] = data[0][i+1:]
				75	if len(data) == 3: # RFC 850 date, deprecated
				76	stuff = data[0].split('-')
				77	if len(stuff) == 3:
				78	data = stuff + data[1:]
				79	if len(data) == 4:
				80	s = data[3]
				81	i = s.find('+')
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	82	if i == -1:
				83	i = s.find('-')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	84	if i > 0:
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	85	data[3:] = [s[:i], s[i:]]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	86	else:
				87	data.append('') # Dummy tz
				88	if len(data) < 5:
				89	return None
				90	data = data[:5]
				91	[dd, mm, yy, tm, tz] = data
				92	mm = mm.lower()
				93	if mm not in _monthnames:
				94	dd, mm = mm, dd.lower()
				95	if mm not in _monthnames:
				96	return None
				97	mm = _monthnames.index(mm) + 1
				98	if mm > 12:
				99	mm -= 12
				100	if dd[-1] == ',':
				101	dd = dd[:-1]
				102	i = yy.find(':')
				103	if i > 0:
				104	yy, tm = tm, yy
				105	if yy[-1] == ',':
				106	yy = yy[:-1]
				107	if not yy[0].isdigit():
				108	yy, tz = tz, yy
				109	if tm[-1] == ',':
				110	tm = tm[:-1]
				111	tm = tm.split(':')
				112	if len(tm) == 2:
				113	[thh, tmm] = tm
				114	tss = '0'
				115	elif len(tm) == 3:
				116	[thh, tmm, tss] = tm
R David Murray	accd1c0	2011-03-13 20:06:23 -0400	[diff] [blame]	117	elif len(tm) == 1 and '.' in tm[0]:
				118	# Some non-compliant MUAs use '.' to separate time elements.
				119	tm = tm[0].split('.')
				120	if len(tm) == 2:
				121	[thh, tmm] = tm
				122	tss = 0
				123	elif len(tm) == 3:
				124	[thh, tmm, tss] = tm
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	125	else:
				126	return None
				127	try:
				128	yy = int(yy)
				129	dd = int(dd)
				130	thh = int(thh)
				131	tmm = int(tmm)
				132	tss = int(tss)
				133	except ValueError:
				134	return None
R. David Murray	219d1c8	2010-08-25 00:45:55 +0000	[diff] [blame]	135	# Check for a yy specified in two-digit format, then convert it to the
				136	# appropriate four-digit format, according to the POSIX standard. RFC 822
				137	# calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
				138	# mandates a 4-digit yy. For more information, see the documentation for
				139	# the time module.
				140	if yy < 100:
				141	# The year is between 1969 and 1999 (inclusive).
				142	if yy > 68:
				143	yy += 1900
				144	# The year is between 2000 and 2068 (inclusive).
				145	else:
				146	yy += 2000
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	147	tzoffset = None
				148	tz = tz.upper()
				149	if tz in _timezones:
				150	tzoffset = _timezones[tz]
				151	else:
				152	try:
				153	tzoffset = int(tz)
				154	except ValueError:
				155	pass
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame^]	156	if tzoffset==0 and tz.startswith('-'):
				157	tzoffset = None
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	158	# Convert a timezone offset into seconds ; -0500 -> -18000
				159	if tzoffset:
				160	if tzoffset < 0:
				161	tzsign = -1
				162	tzoffset = -tzoffset
				163	else:
				164	tzsign = 1
				165	tzoffset = tzsign * ( (tzoffset//100)3600 + (tzoffset % 100)60)
				166	# Daylight Saving Time flag is set to -1, since DST is unknown.
R David Murray	875048b	2011-07-20 11:41:21 -0400	[diff] [blame^]	167	return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	168
				169
				170	def parsedate(data):
				171	"""Convert a time string to a time tuple."""
				172	t = parsedate_tz(data)
				173	if isinstance(t, tuple):
				174	return t[:9]
				175	else:
				176	return t
				177
				178
				179	def mktime_tz(data):
				180	"""Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
				181	if data[9] is None:
				182	# No zone info, so localtime is better assumption than GMT
				183	return time.mktime(data[:8] + (-1,))
				184	else:
				185	t = time.mktime(data[:8] + (0,))
				186	return t - data[9] - time.timezone
				187
				188
				189	def quote(str):
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	190	"""Prepare string to be used in a quoted string.
				191
				192	Turns backslash and double quote characters into quoted pairs. These
				193	are the only characters that need to be quoted inside a quoted string.
				194	Does not add the surrounding double quotes.
				195	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	196	return str.replace('\\', '\\\\').replace('"', '\\"')
				197
				198
				199	class AddrlistClass:
				200	"""Address parser class by Ben Escoto.
				201
				202	To understand what this class does, it helps to have a copy of RFC 2822 in
				203	front of you.
				204
				205	Note: this class interface is deprecated and may be removed in the future.
				206	Use rfc822.AddressList instead.
				207	"""
				208
				209	def __init__(self, field):
				210	"""Initialize a new instance.
				211
				212	`field' is an unparsed address header field, containing
				213	one or more addresses.
				214	"""
				215	self.specials = '()<>@,:;.\"[]'
				216	self.pos = 0
				217	self.LWS = ' \t'
				218	self.CR = '\r\n'
				219	self.FWS = self.LWS + self.CR
				220	self.atomends = self.specials + self.LWS + self.CR
				221	# Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
				222	# is obsolete syntax. RFC 2822 requires that we recognize obsolete
				223	# syntax, so allow dots in phrases.
				224	self.phraseends = self.atomends.replace('.', '')
				225	self.field = field
				226	self.commentlist = []
				227
				228	def gotonext(self):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	229	"""Skip white space and extract comments."""
				230	wslist = []
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	231	while self.pos < len(self.field):
				232	if self.field[self.pos] in self.LWS + '\n\r':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	233	if self.field[self.pos] not in '\n\r':
				234	wslist.append(self.field[self.pos])
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	235	self.pos += 1
				236	elif self.field[self.pos] == '(':
				237	self.commentlist.append(self.getcomment())
				238	else:
				239	break
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	240	return EMPTYSTRING.join(wslist)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	241
				242	def getaddrlist(self):
				243	"""Parse all addresses.
				244
				245	Returns a list containing all of the addresses.
				246	"""
				247	result = []
				248	while self.pos < len(self.field):
				249	ad = self.getaddress()
				250	if ad:
				251	result += ad
				252	else:
				253	result.append(('', ''))
				254	return result
				255
				256	def getaddress(self):
				257	"""Parse the next address."""
				258	self.commentlist = []
				259	self.gotonext()
				260
				261	oldpos = self.pos
				262	oldcl = self.commentlist
				263	plist = self.getphraselist()
				264
				265	self.gotonext()
				266	returnlist = []
				267
				268	if self.pos >= len(self.field):
				269	# Bad email address technically, no domain.
				270	if plist:
				271	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				272
				273	elif self.field[self.pos] in '.@':
				274	# email address is just an addrspec
				275	# this isn't very efficient since we start over
				276	self.pos = oldpos
				277	self.commentlist = oldcl
				278	addrspec = self.getaddrspec()
				279	returnlist = [(SPACE.join(self.commentlist), addrspec)]
				280
				281	elif self.field[self.pos] == ':':
				282	# address is a group
				283	returnlist = []
				284
				285	fieldlen = len(self.field)
				286	self.pos += 1
				287	while self.pos < len(self.field):
				288	self.gotonext()
				289	if self.pos < fieldlen and self.field[self.pos] == ';':
				290	self.pos += 1
				291	break
				292	returnlist = returnlist + self.getaddress()
				293
				294	elif self.field[self.pos] == '<':
				295	# Address is a phrase then a route addr
				296	routeaddr = self.getrouteaddr()
				297
				298	if self.commentlist:
				299	returnlist = [(SPACE.join(plist) + ' (' +
				300	' '.join(self.commentlist) + ')', routeaddr)]
				301	else:
				302	returnlist = [(SPACE.join(plist), routeaddr)]
				303
				304	else:
				305	if plist:
				306	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				307	elif self.field[self.pos] in self.specials:
				308	self.pos += 1
				309
				310	self.gotonext()
				311	if self.pos < len(self.field) and self.field[self.pos] == ',':
				312	self.pos += 1
				313	return returnlist
				314
				315	def getrouteaddr(self):
				316	"""Parse a route address (Return-path value).
				317
				318	This method just skips all the route stuff and returns the addrspec.
				319	"""
				320	if self.field[self.pos] != '<':
				321	return
				322
				323	expectroute = False
				324	self.pos += 1
				325	self.gotonext()
				326	adlist = ''
				327	while self.pos < len(self.field):
				328	if expectroute:
				329	self.getdomain()
				330	expectroute = False
				331	elif self.field[self.pos] == '>':
				332	self.pos += 1
				333	break
				334	elif self.field[self.pos] == '@':
				335	self.pos += 1
				336	expectroute = True
				337	elif self.field[self.pos] == ':':
				338	self.pos += 1
				339	else:
				340	adlist = self.getaddrspec()
				341	self.pos += 1
				342	break
				343	self.gotonext()
				344
				345	return adlist
				346
				347	def getaddrspec(self):
				348	"""Parse an RFC 2822 addr-spec."""
				349	aslist = []
				350
				351	self.gotonext()
				352	while self.pos < len(self.field):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	353	preserve_ws = True
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	354	if self.field[self.pos] == '.':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	355	if aslist and not aslist[-1].strip():
				356	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	357	aslist.append('.')
				358	self.pos += 1
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	359	preserve_ws = False
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	360	elif self.field[self.pos] == '"':
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	361	aslist.append('"%s"' % quote(self.getquote()))
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	362	elif self.field[self.pos] in self.atomends:
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	363	if aslist and not aslist[-1].strip():
				364	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	365	break
				366	else:
				367	aslist.append(self.getatom())
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	368	ws = self.gotonext()
				369	if preserve_ws and ws:
				370	aslist.append(ws)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	371
				372	if self.pos >= len(self.field) or self.field[self.pos] != '@':
				373	return EMPTYSTRING.join(aslist)
				374
				375	aslist.append('@')
				376	self.pos += 1
				377	self.gotonext()
				378	return EMPTYSTRING.join(aslist) + self.getdomain()
				379
				380	def getdomain(self):
				381	"""Get the complete domain name from an address."""
				382	sdlist = []
				383	while self.pos < len(self.field):
				384	if self.field[self.pos] in self.LWS:
				385	self.pos += 1
				386	elif self.field[self.pos] == '(':
				387	self.commentlist.append(self.getcomment())
				388	elif self.field[self.pos] == '[':
				389	sdlist.append(self.getdomainliteral())
				390	elif self.field[self.pos] == '.':
				391	self.pos += 1
				392	sdlist.append('.')
				393	elif self.field[self.pos] in self.atomends:
				394	break
				395	else:
				396	sdlist.append(self.getatom())
				397	return EMPTYSTRING.join(sdlist)
				398
				399	def getdelimited(self, beginchar, endchars, allowcomments=True):
				400	"""Parse a header fragment delimited by special characters.
				401
				402	`beginchar' is the start character for the fragment.
				403	If self is not looking at an instance of `beginchar' then
				404	getdelimited returns the empty string.
				405
				406	`endchars' is a sequence of allowable end-delimiting characters.
				407	Parsing stops when one of these is encountered.
				408
				409	If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
				410	within the parsed fragment.
				411	"""
				412	if self.field[self.pos] != beginchar:
				413	return ''
				414
				415	slist = ['']
				416	quote = False
				417	self.pos += 1
				418	while self.pos < len(self.field):
				419	if quote:
				420	slist.append(self.field[self.pos])
				421	quote = False
				422	elif self.field[self.pos] in endchars:
				423	self.pos += 1
				424	break
				425	elif allowcomments and self.field[self.pos] == '(':
				426	slist.append(self.getcomment())
				427	continue # have already advanced pos from getcomment
				428	elif self.field[self.pos] == '\\':
				429	quote = True
				430	else:
				431	slist.append(self.field[self.pos])
				432	self.pos += 1
				433
				434	return EMPTYSTRING.join(slist)
				435
				436	def getquote(self):
				437	"""Get a quote-delimited fragment from self's field."""
				438	return self.getdelimited('"', '"\r', False)
				439
				440	def getcomment(self):
				441	"""Get a parenthesis-delimited fragment from self's field."""
				442	return self.getdelimited('(', ')\r', True)
				443
				444	def getdomainliteral(self):
				445	"""Parse an RFC 2822 domain-literal."""
				446	return '[%s]' % self.getdelimited('[', ']\r', False)
				447
				448	def getatom(self, atomends=None):
				449	"""Parse an RFC 2822 atom.
				450
				451	Optional atomends specifies a different set of end token delimiters
				452	(the default is to use self.atomends). This is used e.g. in
				453	getphraselist() since phrase endings must not include the `.' (which
				454	is legal in phrases)."""
				455	atomlist = ['']
				456	if atomends is None:
				457	atomends = self.atomends
				458
				459	while self.pos < len(self.field):
				460	if self.field[self.pos] in atomends:
				461	break
				462	else:
				463	atomlist.append(self.field[self.pos])
				464	self.pos += 1
				465
				466	return EMPTYSTRING.join(atomlist)
				467
				468	def getphraselist(self):
				469	"""Parse a sequence of RFC 2822 phrases.
				470
				471	A phrase is a sequence of words, which are in turn either RFC 2822
				472	atoms or quoted-strings. Phrases are canonicalized by squeezing all
				473	runs of continuous whitespace into one space.
				474	"""
				475	plist = []
				476
				477	while self.pos < len(self.field):
				478	if self.field[self.pos] in self.FWS:
				479	self.pos += 1
				480	elif self.field[self.pos] == '"':
				481	plist.append(self.getquote())
				482	elif self.field[self.pos] == '(':
				483	self.commentlist.append(self.getcomment())
				484	elif self.field[self.pos] in self.phraseends:
				485	break
				486	else:
				487	plist.append(self.getatom(self.phraseends))
				488
				489	return plist
				490
				491	class AddressList(AddrlistClass):
				492	"""An AddressList encapsulates a list of parsed RFC 2822 addresses."""
				493	def __init__(self, field):
				494	AddrlistClass.__init__(self, field)
				495	if field:
				496	self.addresslist = self.getaddrlist()
				497	else:
				498	self.addresslist = []
				499
				500	def __len__(self):
				501	return len(self.addresslist)
				502
				503	def __add__(self, other):
				504	# Set union
				505	newaddr = AddressList(None)
				506	newaddr.addresslist = self.addresslist[:]
				507	for x in other.addresslist:
				508	if not x in self.addresslist:
				509	newaddr.addresslist.append(x)
				510	return newaddr
				511
				512	def __iadd__(self, other):
				513	# Set union, in-place
				514	for x in other.addresslist:
				515	if not x in self.addresslist:
				516	self.addresslist.append(x)
				517	return self
				518
				519	def __sub__(self, other):
				520	# Set difference
				521	newaddr = AddressList(None)
				522	for x in self.addresslist:
				523	if not x in other.addresslist:
				524	newaddr.addresslist.append(x)
				525	return newaddr
				526
				527	def __isub__(self, other):
				528	# Set difference, in-place
				529	for x in other.addresslist:
				530	if x in self.addresslist:
				531	self.addresslist.remove(x)
				532	return self
				533
				534	def __getitem__(self, index):
				535	# Make indexing, slices, and 'in' work
				536	return self.addresslist[index]