blob: 4b2f5c63a76ca8ef37a171b761767b7bf26e9029 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
# Names exported by ``from <this module> import *``.
__all__ = ['mktime_tz', 'parsedate', 'parsedate_tz', 'quote']
15
16import time
17
# Small string constants used as separators when joining parsed pieces.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Tables used when parsing a date field.

# Abbreviated month names occupy indices 0-11 and the full names occupy
# indices 12-23, so a month number is ``index + 1``, minus 12 when the match
# came from the second half of the list.
_monthnames = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are offsets written as signed HHMM integers (e.g. -500 is five
# hours behind UTC).
_timezones = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    # Atlantic (used in Canada)
    'AST': -400, 'ADT': -300,
    # Eastern
    'EST': -500, 'EDT': -400,
    # Central
    'CST': -600, 'CDT': -500,
    # Mountain
    'MST': -700, 'MDT': -600,
    # Pacific
    'PST': -800, 'PDT': -700,
}
43
44
def parsedate_tz(data):
    """Convert a date string to a 10-tuple, or None if it cannot be parsed.

    The result is ``(year, month, day, hour, minute, second, 0, 1, -1,
    tzoffset)``.  The first nine items can be passed to time.mktime(); the
    weekday, yearday and DST fields are placeholders (DST is -1, i.e.
    unknown).  ``tzoffset`` is the zone's offset from UTC in seconds, or None
    when the string carries no recognisable zone.

    Zone names are looked up in ``_timezones``; of the RFC 822 military zones
    only ``Z`` is recognised.  Anything else must be a numeric offset such as
    ``-0500``.

    Malformed input (including an empty or whitespace-only string) yields
    None rather than raising.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("10:00:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    # The RFC 850 split above can produce empty pieces ("-jan-99"); indexing
    # into them below would raise, so reject them up front.
    if not (dd and mm and yy):
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Jan 1" as well as "1 Jan".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Matched one of the full month names in the second half of the list.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are in the opposite order.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # Year and zone are in the opposite order.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            # Neither hh.mm nor hh.mm.ss.
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
151
152
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    This is parsedate_tz() with the trailing timezone offset dropped.  Any
    non-tuple result from parsedate_tz() (None for unparseable input) is
    passed through unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
160
161
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    ``data[9]`` is the zone offset in seconds east of UTC, or None.  With an
    offset, the broken-down time is treated as being in that zone and the
    result does not depend on the host's local timezone.  Without one, the
    time is interpreted as local time via time.mktime().
    """
    import calendar

    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(tuple(data[:8]) + (-1,))
    # timegm() treats the fields as UTC; subtracting the offset shifts the
    # wall-clock time in the given zone back to true UTC.
    return calendar.timegm(data) - data[9]
170
171
def quote(str):
    """Escape a string for use inside an RFC 2822 quoted-string.

    Backslashes and double quotes are each prefixed with a backslash; no
    other characters are changed.  The surrounding double quotes are not
    added.
    """
    # Backslashes must be doubled first, otherwise the backslashes added for
    # the double quotes would themselves be doubled.
    backslashes_escaped = str.replace('\\', '\\\\')
    return backslashes_escaped.replace('"', '\\"')
180
181
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor (``self.pos``) over ``self.field``; each ``get*``
    method consumes text at the cursor and returns the parsed piece.

    Note: this class interface is deprecated and may be removed in the future.
    Use the AddressList subclass instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs as a string; CR and LF are skipped but not included.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each a
        ``(name, address)`` pair.  A position where nothing could be parsed
        contributes ``('', '')``.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of ``(name, address)`` pairs: one pair for a plain
        address, several for a group, and an empty list when nothing usable
        was found.  A trailing comma, if present, is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: the phrase scan
        # below may append to self.commentlist, and the addr-spec branch
        # rescans the same text from oldpos.  Restoring an alias would record
        # those comments twice.
        oldcl = list(self.commentlist)
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # The domain of a source-route hop; parsed and discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (normally the closing '>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns ``local@domain``, or just the local part when no '@'
        follows it.  When an '@' is present but no valid domain follows
        (including a domain that itself contains '@'), returns the empty
        string so that a partial address is never mistaken for a real one.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace collected just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # Drop trailing whitespace before the terminator.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if an '@' is encountered, since a domain
        containing '@' (as in ``a@evil.example@good.example``) is invalid.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't parse domains with two `@' like
                # `a@malicious.org@important.com'.
                return ''
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        A backslash makes the following character literal; the backslash
        itself is not included in the result.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met along the way are appended
        to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
473
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed entries live in ``self.addresslist``.  The arithmetic
    operators treat two lists as sets of entries: ``+`` is union and ``-``
    is difference, each with an in-place variant.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # A false `field' (None or an empty string) yields an empty list
        # without running the parser.
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union: everything in self, then those entries of other that
        # self does not already contain.
        union = AddressList(None)
        extras = [addr for addr in other.addresslist
                  if addr not in self.addresslist]
        union.addresslist = self.addresslist[:] + extras
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        # Set difference: entries of self that do not occur in other.
        difference = AddressList(None)
        difference.addresslist.extend(
            addr for addr in self.addresslist
            if addr not in other.addresslist)
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]