# Copyright (C) 2002 Python Software Foundation

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""
7
8import time
Barry Warsaw5c8fef92002-12-30 16:43:42 +00009from types import TupleType
10
# Probe for the boolean builtins.  Where the names are missing the probe
# raises NameError and integer stand-ins are installed as module globals.
# Going through globals() keeps this parseable on interpreters where True
# and False are keywords and cannot be assignment targets.
try:
    True, False
except NameError:
    globals().update({'True': 1, 'False': 0})
16
# Named string constants, intended for use as str.join() separators.
EMPTYSTRING = ''
SPACE = ' '
COMMASPACE = ', '
Barry Warsaw030ddf72002-11-05 19:54:52 +000020
# Parse a date field
# Month names, abbreviated first and then spelled out.  A name's position
# modulo 12 (plus one) is its calendar month number.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are in +/-HHMM form, exactly as they would appear numerically.
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    `data' is a date as found in a message header, e.g.
    'Fri, 09 Nov 2001 01:08:47 -0500'.  A leading day name is skipped, the
    deprecated RFC 850 'dd-Mon-yy' form is accepted, and the day/month and
    year/time fields may appear in either order.

    Returns (year, month, day, hour, minute, second, 0, 0, 0, tzoffset), or
    None when the string cannot be parsed (including empty or
    whitespace-only input).  The year is returned exactly as written, so a
    two-digit year stays two-digit.  `tzoffset' is the zone offset in
    seconds (negative west of UTC, e.g. -0500 -> -18000), or None when no
    usable zone is present.  Zone names are looked up in _timezones; of the
    military zones only Z is recognized.
    """
    tokens = data.split()
    if not tokens:
        # Nothing but whitespace: there is no date to parse.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    first = tokens[0]
    if first.endswith(',') or first.lower() in _daynames:
        # There's a dayname here.  Skip it.
        del tokens[0]
    else:
        comma = first.rfind(',')
        if comma >= 0:
            tokens[0] = first[comma + 1:]
    if len(tokens) == 3:  # RFC 850 date, deprecated
        parts = tokens[0].split('-')
        if len(parts) == 3:
            tokens = parts + tokens[1:]
    if len(tokens) == 4:
        # The zone may be glued onto the time ("01:08:47-0500").  Split at
        # the sign and keep it with the zone so the offset stays signed.
        last = tokens[3]
        cut = last.find('+')
        if cut == -1:
            cut = last.find('-')
        if cut > 0:
            tokens[3:] = [last[:cut], last[cut:]]
        else:
            tokens.append('')  # Dummy tz
    if len(tokens) < 5:
        return None
    dd, mm, yy, tm, tz = tokens[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month comes before the day; try them swapped.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched one of the spelled-out names in the second half.
        mm -= 12
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # The "year" slot holds the time, so the two fields are swapped.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        return None
    if not yy[0].isdigit():
        # The "year" slot holds the zone, so the two fields are swapped.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name or no zone at all: leave the offset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the result an integer on every Python version.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
128
129
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Parses `data' with parsedate_tz() and drops the trailing timezone
    offset.  Whatever parsedate_tz() returns that is not a tuple (None for
    an unparseable date) is passed through unchanged.
    """
    t = parsedate_tz(data)
    # The builtin type is used directly rather than the types.TupleType
    # alias, which is the same object where it exists.
    if isinstance(t, tuple):
        return t[:9]
    return t
Barry Warsaw030ddf72002-11-05 19:54:52 +0000137
138
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data[9]' is the zone offset in seconds, or None when the zone is
    unknown.  With no zone the first eight fields are interpreted as local
    time by time.mktime() (isdst=-1, so the platform decides about DST).
    With a zone the fields are converted as UTC and the offset subtracted,
    which does not depend on the local timezone of the machine.

    Returns the timestamp as a float.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Imported here so the block does not rely on a new module-level import.
    import calendar
    # timegm() reads only the first six fields and treats them as UTC.
    return float(calendar.timegm(data) - data[9])
147
148
def quote(str):
    """Escape backslashes and double quotes in a string.

    Every backslash is doubled first, then every double quote gets a
    backslash in front of it.  No surrounding quote characters are added.
    """
    # Backslashes go first so the ones added for quotes are not doubled.
    doubled = str.replace('\\', '\\\\')
    return doubled.replace('"', '\\"')
152
153
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser walks `self.field' with the cursor `self.pos'; comments met
    along the way are collected in `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comment skipped on the way is
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, address) pair.  A stretch of the field that yields no
        address contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, address) pairs: several for a group,
        otherwise at most one.  A trailing comma is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot, not alias: getphraselist() appends to self.commentlist,
        # and the addr-spec branch below re-parses from oldpos.  Restoring
        # the same list object would record those comments twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Route domains are parsed and then thrown away.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                # getquote() strips the quotes and removes backslash
                # escapes, so escape again before re-wrapping the text.
                text = self.getquote()
                text = text.replace('\\', '\\\\').replace('"', '\\"')
                aslist.append('"%s"' % text)
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The delimiters are not part of the returned text, and a backslash
        is dropped while the character after it is kept literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments go to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
431
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, address) pairs are kept in `self.addresslist'.
    The arithmetic operators treat two lists as sets of such pairs.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __str__(self):
        # Rendered here directly: the helper this method used to call is
        # neither defined nor imported in this module.
        rendered = []
        for name, addr in self.addresslist:
            if name:
                rendered.append('"%s" <%s>' % (quote(name), addr))
            else:
                rendered.append(addr)
        return COMMASPACE.join(rendered)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
480 return self.addresslist[index]