blob: f6efcd5c1ea3a9b996d20434caa63f41a9195d13 [file] [log] [blame]
Barry Warsaw24f79762004-05-09 03:55:11 +00001# Copyright (C) 2002-2004 Python Software Foundation
Barry Warsawbb113862004-10-03 03:16:19 +00002# Contact: email-sig@python.org
Barry Warsaw030ddf72002-11-05 19:54:52 +00003
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
import calendar
import time
Barry Warsaw5c8fef92002-12-30 16:43:42 +000010
# Named separator strings, intended for use with str.join().
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
Barry Warsaw030ddf72002-11-05 19:54:52 +000014
# Parse a date field

# Month names, lower-cased.  Abbreviations occupy indexes 0-11 and the full
# names indexes 12-23, so ``index % 12 + 1`` is the month number.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are signed HHMM offsets from UTC (e.g. -500 means five hours west).
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The result is ``(year, month, day, hour, minute, second, 0, 1, 0,
    tzoffset)``.  The first nine items can be passed to time.mktime(); the
    three constant fields are placeholders, not computed values.  `tzoffset'
    is the zone's offset from UTC in seconds (``-0500`` -> ``-18000``), or
    None when the string carries no zone or an unrecognized one.  Only the
    zone names in the `_timezones' table are recognized; of the military
    zones, only ``Z`` is.

    Returns None if `data' is empty or cannot be parsed.  The year is
    returned exactly as written (a two-digit year is not expanded).
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here.  Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("19:12:08+0100" or
        # "19:12:08-0500").  Split at the sign and keep the sign with the
        # zone so that negative offsets survive.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Month Day" order as well as "Day Month".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    # endswith()/slicing rather than indexing: RFC 850 splitting can leave
    # empty components, which must be rejected below, not crash here.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are transposed.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # Year and zone are transposed.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name or no zone at all: leave the offset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
122
123
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  A
    non-tuple result from parsedate_tz() (None on failure) is passed
    through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
Barry Warsaw030ddf72002-11-05 19:54:52 +0000131
132
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data[9]' is the zone offset in seconds east of UTC, or None when the
    date carried no zone.  Without a zone the fields are interpreted as
    local time by time.mktime().  Returns a float number of seconds since
    the epoch.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Treat the fields as UTC and subtract the zone offset.  Unlike
    # time.mktime() adjusted by time.timezone, calendar.timegm() does not
    # depend on the host's local zone rules.
    return float(calendar.timegm(data[:6]) - data[9])
141
142
def quote(str):
    """Backslash-escape the backslashes and double quotes in a string.

    No surrounding quote characters are added; the result is suitable for
    placing between double quotes.
    """
    # Backslashes first, so the ones added for the quotes are not doubled.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
146
147
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into the header text (`field') and
    every get*() method consumes input from that cursor onwards.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments met on the way are
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (realname, address) pairs.  Input that yields no
        address contributes a ('', '') pair.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, address) pairs: one for a mailbox,
        several (or none) for a group, none if nothing could be parsed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot, not alias: getphraselist() appends to self.commentlist,
        # and the addr-spec branch below must rewind to the state at oldpos or
        # comments seen during look-ahead are recorded twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Route domain: consumed and discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns ``local@domain``, just the local part when no `@' follows
        it, or the empty string when an `@' is followed by no valid domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than a bare
            # local part, so a failed parse cannot pass for a real address.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is met inside the domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't parse domains with two `@' such as
                # `a@malicious.org@important.com': returning the truncated
                # `a@malicious.org' would let a crafted header spoof an
                # address.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters and with backslash
        escapes resolved; the cursor is left just past the end delimiter.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                # Character after a backslash is taken literally.
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() already moved past the nested comment;
                # advancing again would skip a character (possibly our own
                # end delimiter).
                continue
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are diverted to
        self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
425
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, address) pairs are kept in `addresslist'.  The
    +, -, += and -= operators treat two AddressLists as sets of pairs.
    """
    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  The filtered list is built in full
        # before the slice assignment, so this is safe when `other' is
        # `self', and it drops every occurrence of a matching entry, the same
        # result __sub__ gives.  Slice assignment keeps the list object.
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in other.addresslist]
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]