# Source blob: 5a0f5b625b00c573854e455dc912781fa94f8304
# Copyright (C) 2002 Python Software Foundation

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""
7
8import time
9
# Parse a date field

# Month names accepted in a date: the twelve three-letter abbreviations
# followed by the twelve full names, so an index past 12 maps back onto the
# same month number by subtracting 12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple (year, month, day, hour, minute, second, 0, 0, 0,
    tzoffset), or None when the string cannot be parsed.  tzoffset is the
    zone's offset from UTC in seconds (e.g. -0500 -> -18000); it is None
    when the zone is missing or not recognized.

    Zone names are looked up in _timezones; of the military zones only
    `Z' is known.  The year is returned exactly as written (two-digit
    years are not expanded).
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here.  Skip it
        del data[0]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time, e.g. 19:12:08+0100 or
        # 19:12:08-0500.  Split it off, keeping the sign with the zone.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the month and day are swapped ("Nov 20" instead of "20 Nov").
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched a full month name from the second half of the table.
        mm = mm - 12
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time are swapped.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy[:1].isdigit():
        # Year and zone are swapped.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        # Floor division keeps the hours part an integer on every Python.
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    return (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
110
111
def parsedate(data):
    """Convert a time string to a time tuple.

    Returns the first nine items of the tuple produced by parsedate_tz()
    (i.e. without the timezone offset), or whatever non-tuple value
    parsedate_tz() returned when the string could not be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    return t
118
119
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is
    unknown.  With a known offset the result does not depend on the
    host's local timezone settings.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Imported here so the block does not rely on a module-level import.
    import calendar
    # Read the fields as UTC, then remove the zone offset.  This avoids
    # going through the local zone (time.mktime / time.timezone).
    return calendar.timegm(data) - data[9]
128
129
def quote(str):
    """Prepare a string for use inside a quoted string.

    Backslashes and double quotes are turned into backslash-escaped
    pairs.  The surrounding double quotes are NOT added.
    """
    # Escape backslashes first so the backslashes introduced for the
    # double quotes are not themselves doubled.
    return str.replace('\\', '\\\\').replace('"', '\\"')
133
134
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and line breaks; any comments met on the way are
        appended to self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (realname, address) pair.
        """
        result = []
        # Keep going until the field is exhausted.  An entry that yields
        # nothing (e.g. a stray comma) is skipped instead of ending the
        # parse; getaddress() always consumes at least one character when
        # it is not already at the end of the field, so this terminates.
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, address) pairs: empty when nothing
        could be parsed, several entries when the address is a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far, so that restoring
        # them below really discards the ones collected by the phrase scan.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Unparseable special character: step over it.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when not positioned on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec."""
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            # Local part only, no domain.
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                # Previous character was a backslash: take this one literally.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                # getcomment() has already advanced pos past the nested
                # comment; advancing again would drop the next character.
                continue
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
406
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses."""

    def __init__(self, field):
        """Parse `field' into self.addresslist.

        A false `field' (None or the empty string) gives an empty list.
        """
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __str__(self):
        # The pairs are formatted inline: no dump_address_pair helper is
        # defined or imported in the visible part of this module, so
        # calling one here would raise NameError.
        parts = []
        for name, addr in self.addresslist:
            if name:
                parts.append('"' + name + '" <' + addr + '>')
            else:
                parts.append(addr)
        return ", ".join(parts)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
455 return self.addresslist[index]