# blob: c455e056d0ba4ab582638c7c86c857c2fe10b8d7
# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""
8
# Names exported by a star-import of this module.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
import calendar
import time
17
# Separators used when joining parsed pieces back together.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
#
# Month names accepted in a date, abbreviated forms first and full names
# second.  A match in the second group is folded back onto 1..12 by the
# parser (index + 1, minus 12 when it exceeds 12).
_monthnames = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# Abbreviated day-of-week names; used only to recognise and skip a leading
# day name.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are written as signed HHMM integers (e.g. -500 means -05:00).
_timezones = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -400, 'ADT': -300,   # Atlantic (used in Canada)
    'EST': -500, 'EDT': -400,   # Eastern
    'CST': -600, 'CDT': -500,   # Central
    'MST': -700, 'MDT': -600,   # Mountain
    'PST': -800, 'PDT': -700,   # Pacific
}
43
44
def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Returns a 10-tuple whose first nine items have the layout expected by
    time.mktime() and whose last item is the timezone offset in seconds.
    When the parser reports no usable offset (None), 0 is stored in its
    place.

    Returns None when the date cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        # The parser signals failure with None; indexing it would raise
        # TypeError instead of telling the caller the date was bad.
        return None
    if res[9] is None:
        res[9] = 0
    return tuple(res)
54
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.

    Returns a 10-element list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset],
    or None when `data' is empty or cannot be parsed as a date.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: there is no first token to look at.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("10:00:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Perhaps the month and day are the other way round.
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched one of the full month names stored after the abbreviations.
        mm -= 12
    # endswith() rather than [-1] so that an empty field (possible after the
    # RFC 850 split above) cannot raise IndexError; int('') rejects it later.
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # The year and time fields are swapped.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        return None
    if not yy[0].isdigit():
        # The year and zone fields are swapped.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        # Anything other than hh:mm or hh:mm:ss is not a time.
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000": the time is UTC but the source zone is unknown.
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]

169
def parsedate(data):
    """Convert a time string to a time tuple.

    Returns the first nine items of the tuple produced by parsedate_tz()
    (i.e. without the timezone offset).  Any non-tuple result from
    parsedate_tz() is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
177
178
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data[9]' is the timezone offset in seconds east of UTC, or None when
    the zone is unknown, in which case the fields are interpreted as local
    time.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Interpret the fields as UTC and then remove the stated offset.  Going
    # through time.mktime() and time.timezone instead would give the wrong
    # answer whenever the local UTC offset at that date differs from
    # time.timezone (daylight saving time, historical zone changes).
    return calendar.timegm(data) - data[9]
187
188
def quote(str):
    """Prepare string to be used in a quoted string.

    Backslashes and double quotes are each turned into a quoted pair by
    prefixing them with a backslash; no other character needs quoting inside
    a quoted string.  The surrounding double quotes are not added.
    """
    # Backslashes must be doubled first, otherwise the backslashes added in
    # front of the double quotes would themselves be doubled.
    backslashes_escaped = str.replace('\\', '\\\\')
    return backslashes_escaped.replace('"', '\\"')
197
198
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into the header text (`field') and
    every get*() method consumes text starting at that cursor.

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs as a string; CR and LF are skipped but not returned.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each as a
        (display name, addr-spec) pair.  A position where nothing could be
        parsed contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (display name, addr-spec) pairs: one pair for a
        single mailbox, several for a group, and an empty list when nothing
        was recognised.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # NOTE: this binds the same list object, it does not copy it, so
        # comments collected by getphraselist() below are still present when
        # it is "restored".
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller's loop advances.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Route domains are parsed only to skip over them.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it, and the empty
        string when an `@' is present but the domain after it is invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace that preceded the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # Drop trailing whitespace before the terminator.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than a bare
            # local part, so that failed parsing cannot be mistaken for a
            # usable address.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string when the domain is followed by another
        `@'.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't accept a domain followed by a second `@', as in
                # `a@malicious.org@important.com': stopping quietly here
                # would report `a@malicious.org' as a valid address.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters and with backslash
        escapes resolved.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, i.e. the current
        # character is to be taken literally.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met along the way are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
490
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed addresses are kept in `addresslist'; the operators below
    treat that list as a set.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # A false `field' (None or '') yields an empty list without parsing.
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        union = AddressList(None)
        union.addresslist = self.addresslist[:]
        for addr in other.addresslist:
            if addr not in self.addresslist:
                union.addresslist.append(addr)
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        # Set difference
        difference = AddressList(None)
        difference.addresslist.extend(
            [addr for addr in self.addresslist
             if addr not in other.addresslist])
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]