# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

# Names exported by ``from <this module> import *``.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
]

import calendar
import time

# Common separator strings.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
#
# Abbreviated month names come first, then the full names in the same order,
# so an index past the first twelve maps back to a month number by
# subtracting 12 (see _parsedate_tz).
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are signed HHMM integers (e.g. -500 means five hours west of UTC),
# not seconds; _parsedate_tz converts them to seconds.
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The first nine items have the layout of a time tuple; the tenth is the
    timezone offset from UTC in seconds.  When _parsedate_tz() reports no
    offset (None), 0 is substituted.

    Named zones are looked up in the module's _timezones table, which
    deliberately omits the RFC 822 military zones other than Z.

    Returns None when *data* is empty or cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return None
    if res[9] is None:
        # Callers of this public API always get a numeric offset.
        res[9] = 0
    return tuple(res)

def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-item list ``[year, month, day, hour, minute, second, 0, 1,
    -1, tzoffset]``, or None when *data* is empty, blank, or cannot be
    parsed.  Malformed input never raises; it yields None.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.  The element is also None when the zone is
    missing or is neither a known zone name nor an integer.

    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input splits to nothing.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("12:00:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        # Splitting an RFC 850 date on '-' can leave empty fields
        # (e.g. "-jan-2000"); indexing them below would raise IndexError.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Jan 1" as well as "1 Jan".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names occupy the second half of the table.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The "year" slot actually holds the time: swap them.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # The "year" slot holds something non-numeric: swap it with the zone.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            # Wrong number of dotted fields; without this the hour and
            # minute variables would be unbound below.
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000": UTC time, source timezone explicitly unknown.
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]


def parsedate(data):
    """Convert a time string to a time tuple.

    Returns the first nine items of parsedate_tz()'s result (the timezone
    offset is dropped), or None when *data* cannot be parsed.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    # Not a tuple: pass the failure value through unchanged.
    return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    *data* is indexed like a time tuple with the timezone offset in seconds
    (or None) as its tenth item.  When the offset is None the fields are
    interpreted as local time via time.mktime(); otherwise they are
    interpreted as UTC via calendar.timegm() and the offset is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT.
        # tuple() lets a 10-item list be passed as well as a tuple.
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # The parameter name shadows the builtin but is kept for callers that
    # pass it by keyword.  Backslashes must be doubled first, otherwise the
    # backslashes added for the double quotes would be doubled as well.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (``pos``) into the header text (``field``) and
    each ``get*`` method consumes input from the cursor onwards.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs as a string (CR and LF are skipped but not returned).
        """
        wslist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS + '\n\r':
                if ch not in '\n\r':
                    wslist.append(ch)
                self.pos += 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each as a
        (name, address) pair.  A position where nothing could be parsed
        contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: one pair for a single
        mailbox, several for a group, or an empty list when nothing usable
        was found.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: getphraselist() may
        # append to self.commentlist, and the addr-spec branch re-parses the
        # same text, which would otherwise record those comments twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the addr-spec as a string.  If an '@' is present but no valid
        domain follows it, the empty string is returned rather than a bare
        local part.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second '@' is found inside the domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS:
                self.pos += 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            elif ch == '[':
                sdlist.append(self.getdomainliteral())
            elif ch == '.':
                self.pos += 1
                sdlist.append('.')
            elif ch == '@':
                # Don't parse domains with two `@' such as
                # `a@malicious.org@important.com': accepting the first part
                # would let a crafted header pass for a different domain.
                return ''
            elif ch in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is true, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True when the previous character was a backslash, so the current
        # character is taken literally.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  The words are returned as a list with the
        whitespace between them dropped; comments met along the way are
        appended to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are held in ``addresslist``.  The
    ``+``/``-`` operators and their in-place forms treat two lists as sets
    of pairs (union and difference).
    """

    def __init__(self, field):
        """Parse `field'; an empty or None field gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  Iterate over a snapshot so that
        # ``a -= a`` does not remove items from the list being walked.
        for x in list(other.addresslist):
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]