blob: 41694f9b1acac5d6903cdbfcf692cb02df9eb994 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
# Names exported by ``from <this module> import *``: the date helpers and
# quote().  The address parser classes defined further down in this module
# (AddrlistClass, AddressList) are not listed here.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
16import time
17
# Separator strings.  SPACE and EMPTYSTRING are used with str.join() by the
# address parser in this module; COMMASPACE is defined alongside them.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
21
# Parse a date field

# Month names, abbreviated first and then spelled out.  parsedate_tz() maps a
# name to its month number with index() + 1 and folds the second run of
# twelve entries back onto 1..12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or None if it cannot be parsed.

    The result is (year, month, day, hour, minute, second, 0, 1, -1, tzoffset).
    The three constant fields are placeholders (the last one is the DST flag,
    -1 meaning "unknown").  tzoffset is the zone offset in seconds east of
    UTC, or None when the string carries no recognizable zone.

    Zone names are looked up in _timezones, which (apart from 'Z') does not
    contain the military zones; anything else must be a numeric offset such
    as '-0500'.

    Empty input, whitespace-only input and dates with an empty day, month or
    year field yield None instead of raising IndexError.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("19:12:08+0530"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    if not (dd and mm and yy):
        # e.g. the RFC 850 form "01-Jan-" splits into an empty year; the
        # indexing below would otherwise raise IndexError.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Nov 20" as well as "20 Nov".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are in the opposite order.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # Year and zone are in the opposite order.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
143
144
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  When
    parsedate_tz() does not return a tuple (it returns None for input it
    cannot parse), that value is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
152
153
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    data[9] is the zone offset in seconds east of UTC, or None.  With no
    offset the fields are interpreted as local time via time.mktime().  With
    an offset the fields are converted with calendar.timegm() and the offset
    is subtracted, so the result does not depend on the local timezone of the
    machine running the code.

    Returns a float number of seconds since the epoch.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Imported here so this function does not depend on an edit to the
    # module's import block.
    import calendar
    # timegm() treats the fields as UTC; subtracting the offset shifts the
    # wall-clock time in its own zone back to UTC.
    return float(calendar.timegm(data) - data[9])
162
163
def quote(str):
    """Escape a string for use inside a quoted string.

    Each backslash and each double quote is replaced by the corresponding
    quoted pair (the character preceded by a backslash); every other
    character is copied unchanged.  The surrounding double quotes are not
    added.
    """
    # One pass over the input; the replacement text is never rescanned, which
    # gives the same result as escaping backslashes first and quotes second.
    quoted_pairs = {ord('\\'): '\\\\', ord('"'): '\\"'}
    return str.translate(quoted_pairs)
172
173
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor (self.pos) over the header text (self.field); every
    get*() method consumes input starting at the cursor and leaves the cursor
    just past what it consumed.  Comments met along the way are collected in
    self.commentlist.

    Note: this class interface is deprecated and may be removed in the future.
    Use AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (CR and LF are skipped
        but not returned).  Comments are appended to self.commentlist.
        """
        skippable = self.LWS + '\n\r'
        wslist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in skippable:
                if ch not in '\n\r':
                    wslist.append(ch)
                self.pos += 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (name, address) tuples; an address that could not
        be parsed is represented by ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) tuples: one entry for a single
        address, several for a group, none when nothing could be parsed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a copy: the bare addr-spec branch below rewinds to oldpos and
        # parses the same text again, so it must start from the comments seen
        # *before* oldpos.  Keeping a reference to the live list would make
        # every comment after oldpos show up twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec (the
                # closing '>' in well-formed input).
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns 'local@domain', or just the local part when no '@' follows.
        Returns the empty string when an '@' is present but no valid domain
        follows it, so a malformed address is never reported as a shorter
        valid-looking one.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop white space collected just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than a local
            # part that would look like a successful parse.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string, without consuming it, when a second '@' is
        found: 'a@malicious.org@important.com' must not be accepted as the
        address 'a@malicious.org'.
        """
        sdlist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS:
                self.pos += 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            elif ch == '[':
                sdlist.append(self.getdomainliteral())
            elif ch == '.':
                self.pos += 1
                sdlist.append('.')
            elif ch == '@':
                # Must be tested before atomends, which also contains '@'.
                return ''
            elif ch in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The delimiters and the backslashes of quoted pairs are not part of
        the returned string.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False     # True right after a backslash
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                # Take the character after a backslash literally.
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments go to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
465
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) tuples are kept in self.addresslist.  The
    arithmetic operators treat two lists as sets of such tuples: `+' is the
    union and `-' the difference, each with an in-place variant.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            # Test against the result being built, as __iadd__ does, so an
            # entry repeated in `other' is added only once.
            if x not in newaddr.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        newaddr.addresslist = [x for x in self.addresslist
                               if x not in other.addresslist]
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place.  The survivors are computed first and then
        # written back into the same list object, so `a -= a' works and every
        # occurrence of a removed entry goes away, exactly as in __sub__.
        self.addresslist[:] = [x for x in self.addresslist
                               if x not in other.addresslist]
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
511 return self.addresslist[index]