blob: ef9423aadf7ac117533bad2960484b9e2df95c9e [file] [log] [blame]
# Copyright (C) 2002 Python Software Foundation

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""
7
8import time
Barry Warsaw5c8fef92002-12-30 16:43:42 +00009from types import TupleType
10
# True and False are builtins in every Python since 2.2.1, so no fallback
# definitions are needed here (and assigning to them is a SyntaxError on
# Python 3).

# Shared string constants used when joining parsed fragments.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
Barry Warsaw030ddf72002-11-05 19:54:52 +000020
# Parse a date field

# Month names in lower case: the twelve abbreviations first, then the twelve
# full names.  parsedate_tz() maps a name to its month number with
# index() + 1 and subtracts 12 for the full-name half of the list.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

# Lower-case day-of-week abbreviations.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are offsets from UTC written as signed HHMM integers
# (e.g. -500 means five hours behind UTC).
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
42
43
def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    `data' is a date string such as 'Mon, 25 Dec 2002 10:30:00 -0500'.  The
    leading day-of-week is optional, and the deprecated RFC 850 form with a
    dash-separated date ('25-Dec-02') is also accepted.

    Returns the tuple

        (year, month, day, hour, minute, second, 0, 0, 0, tzoffset)

    where tzoffset is the zone's offset from UTC in seconds, or None when the
    string carries no recognizable zone.  Zone names are looked up in the
    module's _timezones table (which, apart from Z, does not contain the
    military zones); anything else is read as a numeric +HHMM/-HHMM offset.

    Returns None if the string is empty or cannot be parsed.
    """
    data = data.split()
    if not data:
        # Empty or whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here.  Skip it.
        del data[0]
    else:
        # A dayname glued to the day ("Mon,25") is stripped through the
        # comma.  No comma at all simply means the optional day-of-week was
        # omitted, so the token is kept as it is.
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("10:30:00+0100" or
        # "10:30:00-0500"); split it off, keeping the sign with the zone.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month comes before the day ("Dec 25").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched a full month name from the second half of the list.
        mm -= 12
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # The year and the time are in swapped positions.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    # Slicing (rather than indexing) keeps an empty field from raising.
    if not yy[:1].isdigit():
        # The year and the zone are in swapped positions.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone: leave tzoffset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the hour part an integer on every Python.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
129
130
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  When
    parsedate_tz() does not return a tuple (it returns None for unparseable
    input), that value is passed through unchanged.
    """
    t = parsedate_tz(data)
    # The builtin `tuple' type is used instead of types.TupleType, which no
    # longer exists on Python 3.
    if isinstance(t, tuple):
        return t[:9]
    return t
Barry Warsaw030ddf72002-11-05 19:54:52 +0000138
139
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data' is a sequence whose first six items are year, month, day, hour,
    minute and second, and whose item 9 is the zone offset from UTC in
    seconds, or None when the zone is unknown.

    Returns the timestamp as a float.  With no zone information the fields
    are interpreted as local time.
    """
    import calendar

    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(tuple(data[:8]) + (-1,))
    # calendar.timegm() reads the fields as UTC directly.  Going through
    # time.mktime() and subtracting time.timezone is only right when the
    # local zone's standard offset on that date equals the current one.
    # float() keeps the return type the same as the local-time branch.
    return float(calendar.timegm(tuple(data[:6]))) - data[9]
148
149
def quote(str):
    """Escape backslashes and double quotes in a string.

    Every backslash is doubled and every double quote is preceded by a
    backslash, so the result can be placed between double quotes.  Despite
    the function's name, no surrounding quote characters are added.
    """
    # Backslashes must be escaped first so that the backslashes added for the
    # double quotes are not doubled again.
    return str.replace('\\', '\\\\').replace('"', '\\"')
153
154
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser walks `self.field' with the cursor `self.pos'; each get*()
    method consumes input starting at the cursor and leaves the cursor just
    past what it consumed.  Comments met along the way are collected in
    `self.commentlist'.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments found are appended to
        self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (comment-or-phrase, addrspec) pair as produced by getaddress().
        Empty or unparseable entries (for example the gap in 'a@b, , c@d')
        are skipped rather than ending the parse, so the addresses that
        follow them are still returned.
        """
        result = []
        while self.pos < len(self.field):
            start = self.pos
            ad = self.getaddress()
            if ad:
                result += ad
            elif self.pos == start:
                # Nothing was consumed; stop instead of looping forever.
                break
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (comment-or-phrase, addrspec) pairs: one pair for a
        plain address, several for a group, and an empty list when nothing
        usable was found at the cursor.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Keep a copy, not an alias: the list is mutated while parsing the
        # phrase, and a bare addr-spec is re-parsed from oldpos below, which
        # would otherwise record the same comments twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               SPACE.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the parse can make progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The domain of a route element is parsed and discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character after the addr-spec (normally the
                # closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it, otherwise
        local-part@domain.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return EMPTYSTRING.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters; a backslash is dropped
        and the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = False
        self.pos += 1
        while self.pos < len(self.field):
            if quote:
                # Previous character was a backslash: take this one as is.
                slist.append(self.field[self.pos])
                quote = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                # getcomment() advances the cursor past the nested comment;
                # the increment below then steps one character further.
                slist.append(self.getcomment())
            elif self.field[self.pos] == '\\':
                quote = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments found between them are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
432
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed addresses are kept in `self.addresslist'.  The arithmetic
    operators treat that list as a set: `+' is union and `-' is difference,
    both preserving the order of the left operand.
    """

    def __init__(self, field):
        """Parse `field'; an empty or None field gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __str__(self):
        # NOTE(review): dump_address_pair is not defined in the part of the
        # file visible here -- confirm it is provided elsewhere.
        return COMMASPACE.join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            # Check against the result being built, as __iadd__ does, so an
            # address repeated in `other' is only added once.
            if x not in newaddr.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  Iterate over a copy so that `a -= a'
        # does not mutate the list it is walking.
        for x in other.addresslist[:]:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]