Blame - Lib/email/_parseaddr.py - platform/external/python/cpython3

blob: 41694f9b1acac5d6903cdbfcf692cb02df9eb994 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2002-2007 Python Software Foundation
				2	# Contact: email-sig@python.org
				3
				4	"""Email address parsing code.
				5
				6	Lifted directly from rfc822.py. This should eventually be rewritten.
				7	"""
				8
				9	__all__ = [
				10	'mktime_tz',
				11	'parsedate',
				12	'parsedate_tz',
				13	'quote',
				14	]
				15
				16	import time
				17
				18	SPACE = ' '
				19	EMPTYSTRING = ''
				20	COMMASPACE = ', '
				21
				22	# Parse a date field
				23	_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
				24	'aug', 'sep', 'oct', 'nov', 'dec',
				25	'january', 'february', 'march', 'april', 'may', 'june', 'july',
				26	'august', 'september', 'october', 'november', 'december']
				27
				28	_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
				29
				30	# The timezone table does not include the military time zones defined
				31	# in RFC822, other than Z. According to RFC1123, the description in
				32	# RFC822 gets the signs wrong, so we can't rely on any such time
				33	# zones. RFC1123 recommends that numeric timezone indicators be used
				34	# instead of timezone names.
				35
				36	_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
				37	'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
				38	'EST': -500, 'EDT': -400, # Eastern
				39	'CST': -600, 'CDT': -500, # Central
				40	'MST': -700, 'MDT': -600, # Mountain
				41	'PST': -800, 'PDT': -700 # Pacific
				42	}
				43
				44
				45	def parsedate_tz(data):
				46	"""Convert a date string to a time tuple.
				47
				48	Accounts for military timezones.
				49	"""
				50	data = data.split()
				51	# The FWS after the comma after the day-of-week is optional, so search and
				52	# adjust for this.
				53	if data[0].endswith(',') or data[0].lower() in _daynames:
				54	# There's a dayname here. Skip it
				55	del data[0]
				56	else:
				57	i = data[0].rfind(',')
				58	if i >= 0:
				59	data[0] = data[0][i+1:]
				60	if len(data) == 3: # RFC 850 date, deprecated
				61	stuff = data[0].split('-')
				62	if len(stuff) == 3:
				63	data = stuff + data[1:]
				64	if len(data) == 4:
				65	s = data[3]
				66	i = s.find('+')
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	67	if i == -1:
				68	i = s.find('-')
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	69	if i > 0:
R. David Murray	4a62e89	2010-12-23 20:35:46 +0000	[diff] [blame]	70	data[3:] = [s[:i], s[i:]]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	71	else:
				72	data.append('') # Dummy tz
				73	if len(data) < 5:
				74	return None
				75	data = data[:5]
				76	[dd, mm, yy, tm, tz] = data
				77	mm = mm.lower()
				78	if mm not in _monthnames:
				79	dd, mm = mm, dd.lower()
				80	if mm not in _monthnames:
				81	return None
				82	mm = _monthnames.index(mm) + 1
				83	if mm > 12:
				84	mm -= 12
				85	if dd[-1] == ',':
				86	dd = dd[:-1]
				87	i = yy.find(':')
				88	if i > 0:
				89	yy, tm = tm, yy
				90	if yy[-1] == ',':
				91	yy = yy[:-1]
				92	if not yy[0].isdigit():
				93	yy, tz = tz, yy
				94	if tm[-1] == ',':
				95	tm = tm[:-1]
				96	tm = tm.split(':')
				97	if len(tm) == 2:
				98	[thh, tmm] = tm
				99	tss = '0'
				100	elif len(tm) == 3:
				101	[thh, tmm, tss] = tm
				102	else:
				103	return None
				104	try:
				105	yy = int(yy)
				106	dd = int(dd)
				107	thh = int(thh)
				108	tmm = int(tmm)
				109	tss = int(tss)
				110	except ValueError:
				111	return None
R. David Murray	219d1c8	2010-08-25 00:45:55 +0000	[diff] [blame]	112	# Check for a yy specified in two-digit format, then convert it to the
				113	# appropriate four-digit format, according to the POSIX standard. RFC 822
				114	# calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
				115	# mandates a 4-digit yy. For more information, see the documentation for
				116	# the time module.
				117	if yy < 100:
				118	# The year is between 1969 and 1999 (inclusive).
				119	if yy > 68:
				120	yy += 1900
				121	# The year is between 2000 and 2068 (inclusive).
				122	else:
				123	yy += 2000
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	124	tzoffset = None
				125	tz = tz.upper()
				126	if tz in _timezones:
				127	tzoffset = _timezones[tz]
				128	else:
				129	try:
				130	tzoffset = int(tz)
				131	except ValueError:
				132	pass
				133	# Convert a timezone offset into seconds ; -0500 -> -18000
				134	if tzoffset:
				135	if tzoffset < 0:
				136	tzsign = -1
				137	tzoffset = -tzoffset
				138	else:
				139	tzsign = 1
				140	tzoffset = tzsign * ( (tzoffset//100)3600 + (tzoffset % 100)60)
				141	# Daylight Saving Time flag is set to -1, since DST is unknown.
				142	return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
				143
				144
				145	def parsedate(data):
				146	"""Convert a time string to a time tuple."""
				147	t = parsedate_tz(data)
				148	if isinstance(t, tuple):
				149	return t[:9]
				150	else:
				151	return t
				152
				153
				154	def mktime_tz(data):
				155	"""Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
				156	if data[9] is None:
				157	# No zone info, so localtime is better assumption than GMT
				158	return time.mktime(data[:8] + (-1,))
				159	else:
				160	t = time.mktime(data[:8] + (0,))
				161	return t - data[9] - time.timezone
				162
				163
				164	def quote(str):
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	165	"""Prepare string to be used in a quoted string.
				166
				167	Turns backslash and double quote characters into quoted pairs. These
				168	are the only characters that need to be quoted inside a quoted string.
				169	Does not add the surrounding double quotes.
				170	"""
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	171	return str.replace('\\', '\\\\').replace('"', '\\"')
				172
				173
				174	class AddrlistClass:
				175	"""Address parser class by Ben Escoto.
				176
				177	To understand what this class does, it helps to have a copy of RFC 2822 in
				178	front of you.
				179
				180	Note: this class interface is deprecated and may be removed in the future.
				181	Use rfc822.AddressList instead.
				182	"""
				183
				184	def __init__(self, field):
				185	"""Initialize a new instance.
				186
				187	`field' is an unparsed address header field, containing
				188	one or more addresses.
				189	"""
				190	self.specials = '()<>@,:;.\"[]'
				191	self.pos = 0
				192	self.LWS = ' \t'
				193	self.CR = '\r\n'
				194	self.FWS = self.LWS + self.CR
				195	self.atomends = self.specials + self.LWS + self.CR
				196	# Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
				197	# is obsolete syntax. RFC 2822 requires that we recognize obsolete
				198	# syntax, so allow dots in phrases.
				199	self.phraseends = self.atomends.replace('.', '')
				200	self.field = field
				201	self.commentlist = []
				202
				203	def gotonext(self):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	204	"""Skip white space and extract comments."""
				205	wslist = []
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	206	while self.pos < len(self.field):
				207	if self.field[self.pos] in self.LWS + '\n\r':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	208	if self.field[self.pos] not in '\n\r':
				209	wslist.append(self.field[self.pos])
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	210	self.pos += 1
				211	elif self.field[self.pos] == '(':
				212	self.commentlist.append(self.getcomment())
				213	else:
				214	break
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	215	return EMPTYSTRING.join(wslist)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	216
				217	def getaddrlist(self):
				218	"""Parse all addresses.
				219
				220	Returns a list containing all of the addresses.
				221	"""
				222	result = []
				223	while self.pos < len(self.field):
				224	ad = self.getaddress()
				225	if ad:
				226	result += ad
				227	else:
				228	result.append(('', ''))
				229	return result
				230
				231	def getaddress(self):
				232	"""Parse the next address."""
				233	self.commentlist = []
				234	self.gotonext()
				235
				236	oldpos = self.pos
				237	oldcl = self.commentlist
				238	plist = self.getphraselist()
				239
				240	self.gotonext()
				241	returnlist = []
				242
				243	if self.pos >= len(self.field):
				244	# Bad email address technically, no domain.
				245	if plist:
				246	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				247
				248	elif self.field[self.pos] in '.@':
				249	# email address is just an addrspec
				250	# this isn't very efficient since we start over
				251	self.pos = oldpos
				252	self.commentlist = oldcl
				253	addrspec = self.getaddrspec()
				254	returnlist = [(SPACE.join(self.commentlist), addrspec)]
				255
				256	elif self.field[self.pos] == ':':
				257	# address is a group
				258	returnlist = []
				259
				260	fieldlen = len(self.field)
				261	self.pos += 1
				262	while self.pos < len(self.field):
				263	self.gotonext()
				264	if self.pos < fieldlen and self.field[self.pos] == ';':
				265	self.pos += 1
				266	break
				267	returnlist = returnlist + self.getaddress()
				268
				269	elif self.field[self.pos] == '<':
				270	# Address is a phrase then a route addr
				271	routeaddr = self.getrouteaddr()
				272
				273	if self.commentlist:
				274	returnlist = [(SPACE.join(plist) + ' (' +
				275	' '.join(self.commentlist) + ')', routeaddr)]
				276	else:
				277	returnlist = [(SPACE.join(plist), routeaddr)]
				278
				279	else:
				280	if plist:
				281	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				282	elif self.field[self.pos] in self.specials:
				283	self.pos += 1
				284
				285	self.gotonext()
				286	if self.pos < len(self.field) and self.field[self.pos] == ',':
				287	self.pos += 1
				288	return returnlist
				289
				290	def getrouteaddr(self):
				291	"""Parse a route address (Return-path value).
				292
				293	This method just skips all the route stuff and returns the addrspec.
				294	"""
				295	if self.field[self.pos] != '<':
				296	return
				297
				298	expectroute = False
				299	self.pos += 1
				300	self.gotonext()
				301	adlist = ''
				302	while self.pos < len(self.field):
				303	if expectroute:
				304	self.getdomain()
				305	expectroute = False
				306	elif self.field[self.pos] == '>':
				307	self.pos += 1
				308	break
				309	elif self.field[self.pos] == '@':
				310	self.pos += 1
				311	expectroute = True
				312	elif self.field[self.pos] == ':':
				313	self.pos += 1
				314	else:
				315	adlist = self.getaddrspec()
				316	self.pos += 1
				317	break
				318	self.gotonext()
				319
				320	return adlist
				321
				322	def getaddrspec(self):
				323	"""Parse an RFC 2822 addr-spec."""
				324	aslist = []
				325
				326	self.gotonext()
				327	while self.pos < len(self.field):
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	328	preserve_ws = True
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	329	if self.field[self.pos] == '.':
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	330	if aslist and not aslist[-1].strip():
				331	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	332	aslist.append('.')
				333	self.pos += 1
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	334	preserve_ws = False
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	335	elif self.field[self.pos] == '"':
R. David Murray	5397e86	2010-10-02 15:58:26 +0000	[diff] [blame]	336	aslist.append('"%s"' % quote(self.getquote()))
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	337	elif self.field[self.pos] in self.atomends:
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	338	if aslist and not aslist[-1].strip():
				339	aslist.pop()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	340	break
				341	else:
				342	aslist.append(self.getatom())
R. David Murray	63563cd	2010-12-18 18:25:38 +0000	[diff] [blame]	343	ws = self.gotonext()
				344	if preserve_ws and ws:
				345	aslist.append(ws)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	346
				347	if self.pos >= len(self.field) or self.field[self.pos] != '@':
				348	return EMPTYSTRING.join(aslist)
				349
				350	aslist.append('@')
				351	self.pos += 1
				352	self.gotonext()
				353	return EMPTYSTRING.join(aslist) + self.getdomain()
				354
				355	def getdomain(self):
				356	"""Get the complete domain name from an address."""
				357	sdlist = []
				358	while self.pos < len(self.field):
				359	if self.field[self.pos] in self.LWS:
				360	self.pos += 1
				361	elif self.field[self.pos] == '(':
				362	self.commentlist.append(self.getcomment())
				363	elif self.field[self.pos] == '[':
				364	sdlist.append(self.getdomainliteral())
				365	elif self.field[self.pos] == '.':
				366	self.pos += 1
				367	sdlist.append('.')
				368	elif self.field[self.pos] in self.atomends:
				369	break
				370	else:
				371	sdlist.append(self.getatom())
				372	return EMPTYSTRING.join(sdlist)
				373
				374	def getdelimited(self, beginchar, endchars, allowcomments=True):
				375	"""Parse a header fragment delimited by special characters.
				376
				377	`beginchar' is the start character for the fragment.
				378	If self is not looking at an instance of `beginchar' then
				379	getdelimited returns the empty string.
				380
				381	`endchars' is a sequence of allowable end-delimiting characters.
				382	Parsing stops when one of these is encountered.
				383
				384	If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
				385	within the parsed fragment.
				386	"""
				387	if self.field[self.pos] != beginchar:
				388	return ''
				389
				390	slist = ['']
				391	quote = False
				392	self.pos += 1
				393	while self.pos < len(self.field):
				394	if quote:
				395	slist.append(self.field[self.pos])
				396	quote = False
				397	elif self.field[self.pos] in endchars:
				398	self.pos += 1
				399	break
				400	elif allowcomments and self.field[self.pos] == '(':
				401	slist.append(self.getcomment())
				402	continue # have already advanced pos from getcomment
				403	elif self.field[self.pos] == '\\':
				404	quote = True
				405	else:
				406	slist.append(self.field[self.pos])
				407	self.pos += 1
				408
				409	return EMPTYSTRING.join(slist)
				410
				411	def getquote(self):
				412	"""Get a quote-delimited fragment from self's field."""
				413	return self.getdelimited('"', '"\r', False)
				414
				415	def getcomment(self):
				416	"""Get a parenthesis-delimited fragment from self's field."""
				417	return self.getdelimited('(', ')\r', True)
				418
				419	def getdomainliteral(self):
				420	"""Parse an RFC 2822 domain-literal."""
				421	return '[%s]' % self.getdelimited('[', ']\r', False)
				422
				423	def getatom(self, atomends=None):
				424	"""Parse an RFC 2822 atom.
				425
				426	Optional atomends specifies a different set of end token delimiters
				427	(the default is to use self.atomends). This is used e.g. in
				428	getphraselist() since phrase endings must not include the `.' (which
				429	is legal in phrases)."""
				430	atomlist = ['']
				431	if atomends is None:
				432	atomends = self.atomends
				433
				434	while self.pos < len(self.field):
				435	if self.field[self.pos] in atomends:
				436	break
				437	else:
				438	atomlist.append(self.field[self.pos])
				439	self.pos += 1
				440
				441	return EMPTYSTRING.join(atomlist)
				442
				443	def getphraselist(self):
				444	"""Parse a sequence of RFC 2822 phrases.
				445
				446	A phrase is a sequence of words, which are in turn either RFC 2822
				447	atoms or quoted-strings. Phrases are canonicalized by squeezing all
				448	runs of continuous whitespace into one space.
				449	"""
				450	plist = []
				451
				452	while self.pos < len(self.field):
				453	if self.field[self.pos] in self.FWS:
				454	self.pos += 1
				455	elif self.field[self.pos] == '"':
				456	plist.append(self.getquote())
				457	elif self.field[self.pos] == '(':
				458	self.commentlist.append(self.getcomment())
				459	elif self.field[self.pos] in self.phraseends:
				460	break
				461	else:
				462	plist.append(self.getatom(self.phraseends))
				463
				464	return plist
				465
				466	class AddressList(AddrlistClass):
				467	"""An AddressList encapsulates a list of parsed RFC 2822 addresses."""
				468	def __init__(self, field):
				469	AddrlistClass.__init__(self, field)
				470	if field:
				471	self.addresslist = self.getaddrlist()
				472	else:
				473	self.addresslist = []
				474
				475	def __len__(self):
				476	return len(self.addresslist)
				477
				478	def __add__(self, other):
				479	# Set union
				480	newaddr = AddressList(None)
				481	newaddr.addresslist = self.addresslist[:]
				482	for x in other.addresslist:
				483	if not x in self.addresslist:
				484	newaddr.addresslist.append(x)
				485	return newaddr
				486
				487	def __iadd__(self, other):
				488	# Set union, in-place
				489	for x in other.addresslist:
				490	if not x in self.addresslist:
				491	self.addresslist.append(x)
				492	return self
				493
				494	def __sub__(self, other):
				495	# Set difference
				496	newaddr = AddressList(None)
				497	for x in self.addresslist:
				498	if not x in other.addresslist:
				499	newaddr.addresslist.append(x)
				500	return newaddr
				501
				502	def __isub__(self, other):
				503	# Set difference, in-place
				504	for x in other.addresslist:
				505	if x in self.addresslist:
				506	self.addresslist.remove(x)
				507	return self
				508
				509	def __getitem__(self, index):
				510	# Make indexing, slices, and 'in' work
				511	return self.addresslist[index]