# Copyright (C) 2002-2004 Python Software Foundation

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""
7
import calendar
import time
Barry Warsaw5c8fef92002-12-30 16:43:42 +00009
# Common separator strings.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
Barry Warsaw030ddf72002-11-05 19:54:52 +000013
# Parse a date field
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or None if it cannot be parsed.

    The result is (year, month, day, hour, minute, second, 0, 0, 0, tzoffset).
    The three zeros are placeholders: weekday, yearday and the DST flag are
    not computed.  The year is returned exactly as written (two-digit years
    are not expanded).  tzoffset is the zone's offset from UTC in seconds
    (e.g. -18000 for -0500), or None when no usable zone was found.

    Zones may be numeric or one of the names in _timezones; military zones
    other than Z are not recognized.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here.  Skip it.
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("19:12:08+0500" or
        # "19:12:08-0500"); split it off, keeping its sign.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    if not (dd and mm and yy):
        # An empty field (e.g. from "-Jan-99") cannot be indexed below.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Day and month may be transposed ("Nov 20" rather than "20 Nov").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    # Full month names sit in slots 12-23 of the table; fold them onto 1-12.
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are transposed.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # Year and zone are transposed.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown or missing zone: leave tzoffset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the result an integer on every Python version.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
121
122
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Delegates to parsedate_tz() and drops the trailing timezone-offset item.
    A non-tuple result from parsedate_tz() is returned unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
Barry Warsaw030ddf72002-11-05 19:54:52 +0000130
131
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data' must be a tuple whose item 9 is the zone offset in seconds, or
    None.  With no offset the fields are interpreted as local time (DST
    left for the platform to decide).  Otherwise the fields are interpreted
    as UTC and the offset is subtracted, so the result does not depend on
    the local timezone or its DST rules.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # calendar.timegm() reads only the first six fields and never consults
    # the local zone, unlike time.mktime() corrected by time.timezone.
    return calendar.timegm(data) - data[9]
140
141
def quote(str):
    """Escape backslashes and double quotes in a string.

    Each backslash is doubled and each double quote is preceded by a
    backslash.  No surrounding quote characters are added.
    """
    # Backslashes must be doubled first so the ones added for the quotes
    # are not themselves doubled.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
145
146
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor (`pos') moving over `field'; every get*() method
    consumes characters from the current position and advances the cursor.

    Note: this class interface is deprecated and may be removed in the future.
    Use AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments met on the way are
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each a
        (name, address) pair.  A position that yields no address
        contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: empty when nothing could be
        parsed, several entries for a group.  A trailing comma is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a snapshot, not an alias: the addr-spec branch below re-parses
        # from oldpos and would otherwise collect the same comments twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return None

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Route domain ("@domain"): parsed and discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character after the addr-spec (the `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address.

        Whitespace is skipped and comments are diverted to self.commentlist.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment's content without its delimiters; a backslash
        makes the following character literal and is itself dropped.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() already moved past the nested comment;
                # advancing again would swallow the next character.
                continue
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments go to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
424
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are kept in self.addresslist.  The
    +, +=, - and -= operators treat two AddressLists as sets of pairs.
    """

    def __init__(self, field):
        """Parse `field' into self.addresslist; a false `field' gives []."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            # Test against the list being built so that an entry repeated
            # in `other' is added only once, as __iadd__ does.
            if x not in newaddr.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        newaddr.addresslist = [x for x in self.addresslist
                               if x not in other.addresslist]
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  The survivors are computed before the
        # list is touched, so `a -= a' is safe and every occurrence of a
        # removed entry goes, matching __sub__.  Slice assignment keeps the
        # same list object.
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in other.addresslist]
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]