Blame - Lib/email/_parseaddr.py - platform/external/python/cpython3

blob: ac2e524401e327fe31170a011ae69049d602d419 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2002-2007 Python Software Foundation
				2	# Contact: email-sig@python.org
				3
				4	"""Email address parsing code.
				5
				6	Lifted directly from rfc822.py. This should eventually be rewritten.
				7	"""
				8
				9	__all__ = [
				10	'mktime_tz',
				11	'parsedate',
				12	'parsedate_tz',
				13	'quote',
				14	]
				15
				16	import time
				17
				18	SPACE = ' '
				19	EMPTYSTRING = ''
				20	COMMASPACE = ', '
				21
				22	# Parse a date field
				23	_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
				24	'aug', 'sep', 'oct', 'nov', 'dec',
				25	'january', 'february', 'march', 'april', 'may', 'june', 'july',
				26	'august', 'september', 'october', 'november', 'december']
				27
				28	_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
				29
				30	# The timezone table does not include the military time zones defined
				31	# in RFC822, other than Z. According to RFC1123, the description in
				32	# RFC822 gets the signs wrong, so we can't rely on any such time
				33	# zones. RFC1123 recommends that numeric timezone indicators be used
				34	# instead of timezone names.
				35
				36	_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
				37	'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
				38	'EST': -500, 'EDT': -400, # Eastern
				39	'CST': -600, 'CDT': -500, # Central
				40	'MST': -700, 'MDT': -600, # Mountain
				41	'PST': -800, 'PDT': -700 # Pacific
				42	}
				43
				44
				45	def parsedate_tz(data):
				46	"""Convert a date string to a time tuple.
				47
				48	Accounts for military timezones.
				49	"""
				50	data = data.split()
				51	# The FWS after the comma after the day-of-week is optional, so search and
				52	# adjust for this.
				53	if data[0].endswith(',') or data[0].lower() in _daynames:
				54	# There's a dayname here. Skip it
				55	del data[0]
				56	else:
				57	i = data[0].rfind(',')
				58	if i >= 0:
				59	data[0] = data[0][i+1:]
				60	if len(data) == 3: # RFC 850 date, deprecated
				61	stuff = data[0].split('-')
				62	if len(stuff) == 3:
				63	data = stuff + data[1:]
				64	if len(data) == 4:
				65	s = data[3]
				66	i = s.find('+')
				67	if i > 0:
				68	data[3:] = [s[:i], s[i+1:]]
				69	else:
				70	data.append('') # Dummy tz
				71	if len(data) < 5:
				72	return None
				73	data = data[:5]
				74	[dd, mm, yy, tm, tz] = data
				75	mm = mm.lower()
				76	if mm not in _monthnames:
				77	dd, mm = mm, dd.lower()
				78	if mm not in _monthnames:
				79	return None
				80	mm = _monthnames.index(mm) + 1
				81	if mm > 12:
				82	mm -= 12
				83	if dd[-1] == ',':
				84	dd = dd[:-1]
				85	i = yy.find(':')
				86	if i > 0:
				87	yy, tm = tm, yy
				88	if yy[-1] == ',':
				89	yy = yy[:-1]
				90	if not yy[0].isdigit():
				91	yy, tz = tz, yy
				92	if tm[-1] == ',':
				93	tm = tm[:-1]
				94	tm = tm.split(':')
				95	if len(tm) == 2:
				96	[thh, tmm] = tm
				97	tss = '0'
				98	elif len(tm) == 3:
				99	[thh, tmm, tss] = tm
				100	else:
				101	return None
				102	try:
				103	yy = int(yy)
				104	dd = int(dd)
				105	thh = int(thh)
				106	tmm = int(tmm)
				107	tss = int(tss)
				108	except ValueError:
				109	return None
R. David Murray	1061f18	2010-08-25 01:55:24 +0000	[diff] [blame^]	110	# Check for a yy specified in two-digit format, then convert it to the
				111	# appropriate four-digit format, according to the POSIX standard. RFC 822
				112	# calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
				113	# mandates a 4-digit yy. For more information, see the documentation for
				114	# the time module.
				115	if yy < 100:
				116	# The year is between 1969 and 1999 (inclusive).
				117	if yy > 68:
				118	yy += 1900
				119	# The year is between 2000 and 2068 (inclusive).
				120	else:
				121	yy += 2000
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	122	tzoffset = None
				123	tz = tz.upper()
				124	if tz in _timezones:
				125	tzoffset = _timezones[tz]
				126	else:
				127	try:
				128	tzoffset = int(tz)
				129	except ValueError:
				130	pass
				131	# Convert a timezone offset into seconds ; -0500 -> -18000
				132	if tzoffset:
				133	if tzoffset < 0:
				134	tzsign = -1
				135	tzoffset = -tzoffset
				136	else:
				137	tzsign = 1
				138	tzoffset = tzsign * ( (tzoffset//100)3600 + (tzoffset % 100)60)
				139	# Daylight Saving Time flag is set to -1, since DST is unknown.
				140	return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
				141
				142
				143	def parsedate(data):
				144	"""Convert a time string to a time tuple."""
				145	t = parsedate_tz(data)
				146	if isinstance(t, tuple):
				147	return t[:9]
				148	else:
				149	return t
				150
				151
				152	def mktime_tz(data):
				153	"""Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
				154	if data[9] is None:
				155	# No zone info, so localtime is better assumption than GMT
				156	return time.mktime(data[:8] + (-1,))
				157	else:
				158	t = time.mktime(data[:8] + (0,))
				159	return t - data[9] - time.timezone
				160
				161
				162	def quote(str):
				163	"""Add quotes around a string."""
				164	return str.replace('\\', '\\\\').replace('"', '\\"')
				165
				166
				167	class AddrlistClass:
				168	"""Address parser class by Ben Escoto.
				169
				170	To understand what this class does, it helps to have a copy of RFC 2822 in
				171	front of you.
				172
				173	Note: this class interface is deprecated and may be removed in the future.
				174	Use rfc822.AddressList instead.
				175	"""
				176
				177	def __init__(self, field):
				178	"""Initialize a new instance.
				179
				180	`field' is an unparsed address header field, containing
				181	one or more addresses.
				182	"""
				183	self.specials = '()<>@,:;.\"[]'
				184	self.pos = 0
				185	self.LWS = ' \t'
				186	self.CR = '\r\n'
				187	self.FWS = self.LWS + self.CR
				188	self.atomends = self.specials + self.LWS + self.CR
				189	# Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
				190	# is obsolete syntax. RFC 2822 requires that we recognize obsolete
				191	# syntax, so allow dots in phrases.
				192	self.phraseends = self.atomends.replace('.', '')
				193	self.field = field
				194	self.commentlist = []
				195
				196	def gotonext(self):
				197	"""Parse up to the start of the next address."""
				198	while self.pos < len(self.field):
				199	if self.field[self.pos] in self.LWS + '\n\r':
				200	self.pos += 1
				201	elif self.field[self.pos] == '(':
				202	self.commentlist.append(self.getcomment())
				203	else:
				204	break
				205
				206	def getaddrlist(self):
				207	"""Parse all addresses.
				208
				209	Returns a list containing all of the addresses.
				210	"""
				211	result = []
				212	while self.pos < len(self.field):
				213	ad = self.getaddress()
				214	if ad:
				215	result += ad
				216	else:
				217	result.append(('', ''))
				218	return result
				219
				220	def getaddress(self):
				221	"""Parse the next address."""
				222	self.commentlist = []
				223	self.gotonext()
				224
				225	oldpos = self.pos
				226	oldcl = self.commentlist
				227	plist = self.getphraselist()
				228
				229	self.gotonext()
				230	returnlist = []
				231
				232	if self.pos >= len(self.field):
				233	# Bad email address technically, no domain.
				234	if plist:
				235	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				236
				237	elif self.field[self.pos] in '.@':
				238	# email address is just an addrspec
				239	# this isn't very efficient since we start over
				240	self.pos = oldpos
				241	self.commentlist = oldcl
				242	addrspec = self.getaddrspec()
				243	returnlist = [(SPACE.join(self.commentlist), addrspec)]
				244
				245	elif self.field[self.pos] == ':':
				246	# address is a group
				247	returnlist = []
				248
				249	fieldlen = len(self.field)
				250	self.pos += 1
				251	while self.pos < len(self.field):
				252	self.gotonext()
				253	if self.pos < fieldlen and self.field[self.pos] == ';':
				254	self.pos += 1
				255	break
				256	returnlist = returnlist + self.getaddress()
				257
				258	elif self.field[self.pos] == '<':
				259	# Address is a phrase then a route addr
				260	routeaddr = self.getrouteaddr()
				261
				262	if self.commentlist:
				263	returnlist = [(SPACE.join(plist) + ' (' +
				264	' '.join(self.commentlist) + ')', routeaddr)]
				265	else:
				266	returnlist = [(SPACE.join(plist), routeaddr)]
				267
				268	else:
				269	if plist:
				270	returnlist = [(SPACE.join(self.commentlist), plist[0])]
				271	elif self.field[self.pos] in self.specials:
				272	self.pos += 1
				273
				274	self.gotonext()
				275	if self.pos < len(self.field) and self.field[self.pos] == ',':
				276	self.pos += 1
				277	return returnlist
				278
				279	def getrouteaddr(self):
				280	"""Parse a route address (Return-path value).
				281
				282	This method just skips all the route stuff and returns the addrspec.
				283	"""
				284	if self.field[self.pos] != '<':
				285	return
				286
				287	expectroute = False
				288	self.pos += 1
				289	self.gotonext()
				290	adlist = ''
				291	while self.pos < len(self.field):
				292	if expectroute:
				293	self.getdomain()
				294	expectroute = False
				295	elif self.field[self.pos] == '>':
				296	self.pos += 1
				297	break
				298	elif self.field[self.pos] == '@':
				299	self.pos += 1
				300	expectroute = True
				301	elif self.field[self.pos] == ':':
				302	self.pos += 1
				303	else:
				304	adlist = self.getaddrspec()
				305	self.pos += 1
				306	break
				307	self.gotonext()
				308
				309	return adlist
				310
				311	def getaddrspec(self):
				312	"""Parse an RFC 2822 addr-spec."""
				313	aslist = []
				314
				315	self.gotonext()
				316	while self.pos < len(self.field):
				317	if self.field[self.pos] == '.':
				318	aslist.append('.')
				319	self.pos += 1
				320	elif self.field[self.pos] == '"':
				321	aslist.append('"%s"' % self.getquote())
				322	elif self.field[self.pos] in self.atomends:
				323	break
				324	else:
				325	aslist.append(self.getatom())
				326	self.gotonext()
				327
				328	if self.pos >= len(self.field) or self.field[self.pos] != '@':
				329	return EMPTYSTRING.join(aslist)
				330
				331	aslist.append('@')
				332	self.pos += 1
				333	self.gotonext()
				334	return EMPTYSTRING.join(aslist) + self.getdomain()
				335
				336	def getdomain(self):
				337	"""Get the complete domain name from an address."""
				338	sdlist = []
				339	while self.pos < len(self.field):
				340	if self.field[self.pos] in self.LWS:
				341	self.pos += 1
				342	elif self.field[self.pos] == '(':
				343	self.commentlist.append(self.getcomment())
				344	elif self.field[self.pos] == '[':
				345	sdlist.append(self.getdomainliteral())
				346	elif self.field[self.pos] == '.':
				347	self.pos += 1
				348	sdlist.append('.')
				349	elif self.field[self.pos] in self.atomends:
				350	break
				351	else:
				352	sdlist.append(self.getatom())
				353	return EMPTYSTRING.join(sdlist)
				354
				355	def getdelimited(self, beginchar, endchars, allowcomments=True):
				356	"""Parse a header fragment delimited by special characters.
				357
				358	`beginchar' is the start character for the fragment.
				359	If self is not looking at an instance of `beginchar' then
				360	getdelimited returns the empty string.
				361
				362	`endchars' is a sequence of allowable end-delimiting characters.
				363	Parsing stops when one of these is encountered.
				364
				365	If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
				366	within the parsed fragment.
				367	"""
				368	if self.field[self.pos] != beginchar:
				369	return ''
				370
				371	slist = ['']
				372	quote = False
				373	self.pos += 1
				374	while self.pos < len(self.field):
				375	if quote:
				376	slist.append(self.field[self.pos])
				377	quote = False
				378	elif self.field[self.pos] in endchars:
				379	self.pos += 1
				380	break
				381	elif allowcomments and self.field[self.pos] == '(':
				382	slist.append(self.getcomment())
				383	continue # have already advanced pos from getcomment
				384	elif self.field[self.pos] == '\\':
				385	quote = True
				386	else:
				387	slist.append(self.field[self.pos])
				388	self.pos += 1
				389
				390	return EMPTYSTRING.join(slist)
				391
				392	def getquote(self):
				393	"""Get a quote-delimited fragment from self's field."""
				394	return self.getdelimited('"', '"\r', False)
				395
				396	def getcomment(self):
				397	"""Get a parenthesis-delimited fragment from self's field."""
				398	return self.getdelimited('(', ')\r', True)
				399
				400	def getdomainliteral(self):
				401	"""Parse an RFC 2822 domain-literal."""
				402	return '[%s]' % self.getdelimited('[', ']\r', False)
				403
				404	def getatom(self, atomends=None):
				405	"""Parse an RFC 2822 atom.
				406
				407	Optional atomends specifies a different set of end token delimiters
				408	(the default is to use self.atomends). This is used e.g. in
				409	getphraselist() since phrase endings must not include the `.' (which
				410	is legal in phrases)."""
				411	atomlist = ['']
				412	if atomends is None:
				413	atomends = self.atomends
				414
				415	while self.pos < len(self.field):
				416	if self.field[self.pos] in atomends:
				417	break
				418	else:
				419	atomlist.append(self.field[self.pos])
				420	self.pos += 1
				421
				422	return EMPTYSTRING.join(atomlist)
				423
				424	def getphraselist(self):
				425	"""Parse a sequence of RFC 2822 phrases.
				426
				427	A phrase is a sequence of words, which are in turn either RFC 2822
				428	atoms or quoted-strings. Phrases are canonicalized by squeezing all
				429	runs of continuous whitespace into one space.
				430	"""
				431	plist = []
				432
				433	while self.pos < len(self.field):
				434	if self.field[self.pos] in self.FWS:
				435	self.pos += 1
				436	elif self.field[self.pos] == '"':
				437	plist.append(self.getquote())
				438	elif self.field[self.pos] == '(':
				439	self.commentlist.append(self.getcomment())
				440	elif self.field[self.pos] in self.phraseends:
				441	break
				442	else:
				443	plist.append(self.getatom(self.phraseends))
				444
				445	return plist
				446
				447	class AddressList(AddrlistClass):
				448	"""An AddressList encapsulates a list of parsed RFC 2822 addresses."""
				449	def __init__(self, field):
				450	AddrlistClass.__init__(self, field)
				451	if field:
				452	self.addresslist = self.getaddrlist()
				453	else:
				454	self.addresslist = []
				455
				456	def __len__(self):
				457	return len(self.addresslist)
				458
				459	def __add__(self, other):
				460	# Set union
				461	newaddr = AddressList(None)
				462	newaddr.addresslist = self.addresslist[:]
				463	for x in other.addresslist:
				464	if not x in self.addresslist:
				465	newaddr.addresslist.append(x)
				466	return newaddr
				467
				468	def __iadd__(self, other):
				469	# Set union, in-place
				470	for x in other.addresslist:
				471	if not x in self.addresslist:
				472	self.addresslist.append(x)
				473	return self
				474
				475	def __sub__(self, other):
				476	# Set difference
				477	newaddr = AddressList(None)
				478	for x in self.addresslist:
				479	if not x in other.addresslist:
				480	newaddr.addresslist.append(x)
				481	return newaddr
				482
				483	def __isub__(self, other):
				484	# Set difference, in-place
				485	for x in other.addresslist:
				486	if x in self.addresslist:
				487	self.addresslist.remove(x)
				488	return self
				489
				490	def __getitem__(self, index):
				491	# Make indexing, slices, and 'in' work
				492	return self.addresslist[index]