blob: 699d418b3feb8241003b4890270adfe4862dbfa7 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
# Names exported by a star-import of this module.
__all__ = ['mktime_tz', 'parsedate', 'parsedate_tz', 'quote']
15
import calendar
import time
17
# Small string constants used as separators/joiners.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Tables used when parsing a date field.
#
# Month names: the twelve abbreviations first, then the twelve full names,
# so that a match at index >= 12 maps back to the same month as index - 12.
_monthnames = (
    'jan feb mar apr may jun jul aug sep oct nov dec '
    'january february march april may june july '
    'august september october november december'
).split()

_daynames = 'mon tue wed thu fri sat sun'.split()

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are written in +/-hhmm form (e.g. -500 means five hours behind UTC).
_timezones = dict.fromkeys(('UT', 'UTC', 'GMT', 'Z'), 0)
_timezones.update(
    AST=-400, ADT=-300,  # Atlantic (used in Canada)
    EST=-500, EDT=-400,  # Eastern
    CST=-600, CDT=-500,  # Central
    MST=-700, MDT=-600,  # Mountain
    PST=-800, PDT=-700,  # Pacific
)
43
44
def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The result has the form ``(year, month, day, hour, minute, second,
    0, 1, -1, tzoffset)``.  The weekday and yearday slots are fixed
    placeholders, the DST flag is -1 (unknown), and `tzoffset` is the zone
    offset in seconds (e.g. -18000 for "-0500") or None when no usable
    zone was found.  Zone names are looked up in the module-level
    `_timezones` table; anything else is parsed as a numeric offset.

    Returns None if `data` is empty or cannot be parsed as a date.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("12:00:00+0100" or
        # "12:00:00-0500"); split it off, keeping the sign with the zone.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        # An RFC 850 split such as "-jan-94" can leave empty fields; the
        # indexing below would otherwise raise IndexError.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Jan 1" as well as "1 Jan".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names follow the abbreviations in the table.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are transposed ("... 12:00:00 1994").
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
        if not yy:
            return None
    if not yy[0].isdigit():
        # Year and zone are transposed ("... GMT 1994").
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
141
142
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Delegates to parsedate_tz() and drops the trailing timezone offset.
    A non-tuple result (such as None for an unparseable date) is passed
    through unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
150
151
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    `data[9]` is the zone offset in seconds (e.g. -18000 for "-0500"), or
    None when the zone is unknown.  With an unknown zone the fields are
    interpreted as local time; otherwise they are interpreted in the given
    zone and the result does not depend on the host's local timezone.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Treat the broken-down fields as UTC and then remove the zone offset.
    # Going through time.mktime() and time.timezone instead would give
    # wrong answers whenever the local zone's DST rules apply to the date.
    return calendar.timegm(data) - data[9]
160
161
def quote(str):
    """Escape a string for use inside a quoted string.

    Every backslash and every double quote is turned into a quoted pair
    (prefixed with a backslash); these are the only characters that need
    quoting inside a quoted string.  The surrounding double quotes are not
    added.
    """
    # Backslashes must be doubled first, otherwise the backslashes added
    # for the double quotes would themselves get doubled.
    backslashes_escaped = str.replace('\\', '\\\\')
    return backslashes_escaped.replace('"', '\\"')
170
171
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a simple cursor over `self.field`: `self.pos` is the index
    of the next unread character and every get*() method advances it.
    Comments found along the way are collected in `self.commentlist`.

    Note: this class interface is deprecated and may be removed in the future.
    Use AddressList (defined in this module) instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (CR and LF are
        skipped but not returned).  Comments are appended to
        self.commentlist.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each as a
        2-tuple.  A position where no address could be parsed contributes
        an ('', '') entry.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of 2-tuples: one entry for a single address, several
        for a group, or an empty list if nothing could be parsed.  A
        trailing comma after the address is consumed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned at a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Discard the domain of an "@domain" route element.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addrspec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string.  If an `@' is present but no valid
        domain follows it (including the case of a second `@' inside the
        domain), the empty string is returned instead of a partial address.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Whitespace around a dot is not significant; drop it.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain: return an empty address rather than a bare
            # local part or a truncated address, to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if the domain is followed by another `@'.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't parse domains with two `@', such as
                # `a@malicious.org@important.com': silently stopping here
                # would report `a@malicious.org' as the address.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The delimiters themselves are not included in the result, and a
        backslash causes the following character to be taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments are moved to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
463
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed entries are kept in `self.addresslist`; the operators below
    treat that list as an ordered set.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # A false `field` (None or '') gives an empty list without parsing.
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union: everything in self, then other's entries not in self.
        union = AddressList(None)
        mine = self.addresslist
        union.addresslist = mine[:] + [
            entry for entry in other.addresslist if entry not in mine
        ]
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for entry in other.addresslist:
            if entry in self.addresslist:
                continue
            self.addresslist.append(entry)
        return self

    def __sub__(self, other):
        # Set difference: self's entries that do not appear in other.
        difference = AddressList(None)
        difference.addresslist = [
            entry for entry in self.addresslist
            if entry not in other.addresslist
        ]
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for entry in other.addresslist:
            if entry not in self.addresslist:
                continue
            self.addresslist.remove(entry)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]
509 return self.addresslist[index]