blob: ba5ad5a36d06b740d6d515aa7bd47d8464483155 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
# Names exported by a star-import of this module.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
15
Alexander Belopolskya07548e2012-06-21 20:34:09 -040016import time, calendar
Guido van Rossum8b3febe2007-08-30 01:15:14 +000017
# Frequently used string constants (handy as joiners, e.g. SPACE.join(parts)).
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
21
# Parse a date field
# Abbreviated month names come first, full names second, so that
# (index % 12) + 1 is the month number for either spelling.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Values are offsets in +/-HHMM form (e.g. -500 means five hours west of UTC).
_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    The first nine items have the layout of a time tuple (items 6, 7 and 8
    are always 0, 1 and -1); the tenth is the timezone offset in seconds
    east of UTC.  A zone that is missing, unrecognized or given as -0000 is
    reported as offset 0.  Named zones are limited to those listed in
    _timezones; military zones other than Z are not recognized.

    Returns None if the string cannot be parsed.
    """
    res = _parsedate_tz(data)
    if not res:
        return
    if res[9] is None:
        # Callers of the public API get 0 rather than the internal
        # "zone unknown" marker.
        res[9] = 0
    return tuple(res)


def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-item list (a list so that parsedate_tz() can patch the last
    item), or None if the input cannot be parsed.

    The last (additional) element is the time zone offset in seconds, except if
    the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge of
    the source timezone, as opposed to a +0000 timestamp that indicates the
    source timezone really was UTC.  The last element is also None when the
    zone is absent or not recognized.

    """
    if not data:
        return None
    data = data.split()
    if not data:  # This happens for whitespace-only input.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time, e.g. "13:32:02+0100".
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    dd, mm, yy, tm, tz = data[:5]
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Month Day" order as well as "Day Month".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        mm -= 12
    # NOTE: the fields below may be empty strings (an RFC 850 style date such
    # as "-jan-2000", or a field consisting of a lone comma), so use
    # endswith() and explicit emptiness checks instead of indexing.
    if dd.endswith(','):
        dd = dd[:-1]
    if yy.find(':') > 0:
        # Year and time were given in the opposite order.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not dd or not yy:
        return None
    if not yy[0].isdigit():
        # Year and zone were given in the opposite order.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        thh, tmm = tm
        tss = '0'
    elif len(tm) == 3:
        thh, tmm, tss = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            thh, tmm = tm
            tss = '0'
        elif len(tm) == 3:
            thh, tmm, tss = tm
        else:
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        if yy > 68:
            # The year is between 1969 and 1999 (inclusive).
            yy += 1900
        else:
            # The year is between 2000 and 2068 (inclusive).
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone: leave the offset as None.
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # -0000 explicitly means "zone unknown".
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000176
177
def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Delegates to parsedate_tz() and drops the trailing timezone offset.
    Any non-tuple result from parsedate_tz() is returned unchanged.
    """
    parsed = parsedate_tz(data)
    if not isinstance(parsed, tuple):
        return parsed
    return parsed[:9]
185
186
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data' may be any 10-item sequence (tuple or list).  If its last item is
    None the first eight items are interpreted as local time via
    time.mktime(); otherwise the fields are interpreted by calendar.timegm()
    and the last item (an offset in seconds) is subtracted from the result.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        # tuple() lets list input work too; time.mktime() needs a tuple.
        return time.mktime(tuple(data[:8]) + (-1,))
    return calendar.timegm(data) - data[9]
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000195
196
def quote(str):
    """Escape a string for use inside a quoted string.

    Backslash and double quote characters are each preceded by a backslash;
    no other characters need quoting inside a quoted string.  The surrounding
    double quotes are not added.
    """
    # Backslashes go first so the ones added for '"' are not doubled again.
    escaped = str.replace('\\', '\\\\')
    return escaped.replace('"', '\\"')
205
206
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor (self.pos) over self.field; every get*() method
    consumes input starting at the cursor and advances it.  Comments met
    along the way are collected in self.commentlist.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Returns the skipped spaces and tabs as a string (CR and LF are
        skipped but not returned); comments go to self.commentlist.
        """
        wslist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS + '\n\r':
                if ch not in '\n\r':
                    wslist.append(ch)
                self.pos += 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return ''.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list of (name, address) pairs.  A stretch of input that
        yields no address contributes the pair ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: empty when nothing could be
        parsed, several entries for a group.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: the addr-spec branch
        # below rewinds to oldpos and re-parses, and must not collect the
        # comments found by getphraselist() a second time.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if (self.pos < len(self.field)
                        and self.field[self.pos] == ';'):
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so parsing can make progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if expectroute:
                # Consume and discard the domain of a source route.
                self.getdomain()
                expectroute = False
            elif ch == '>':
                self.pos += 1
                break
            elif ch == '@':
                self.pos += 1
                expectroute = True
            elif ch == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the address as a string, just the local part if no `@'
        follows it, or the empty string if the domain is invalid.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            preserve_ws = True
            if ch == '.':
                # Drop whitespace on either side of a dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif ch == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif ch in self.atomends:
                # Drop whitespace just before the end of the local part.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return ''
        return ''.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string if a second `@' is found.
        """
        sdlist = []
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.LWS:
                self.pos += 1
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            elif ch == '[':
                sdlist.append(self.getdomainliteral())
            elif ch == '.':
                self.pos += 1
                sdlist.append('.')
            elif ch == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return ''
            elif ch in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        The returned text excludes the delimiters; a backslash is dropped and
        the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if escaped:
                slist.append(ch)
                escaped = False
            elif ch in endchars:
                self.pos += 1
                break
            elif allowcomments and ch == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif ch == '\\':
                escaped = True
            else:
                slist.append(ch)
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        if atomends is None:
            atomends = self.atomends

        start = self.pos
        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            self.pos += 1

        return self.field[start:self.pos]

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Returns the words as a list; whitespace
        between them is dropped and comments go to self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            ch = self.field[self.pos]
            if ch in self.FWS:
                self.pos += 1
            elif ch == '"':
                plist.append(self.getquote())
            elif ch == '(':
                self.commentlist.append(self.getcomment())
            elif ch in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
507
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed addresses are kept in self.addresslist; the arithmetic
    operators combine two instances by membership in that list.
    """

    def __init__(self, field):
        AddrlistClass.__init__(self, field)
        # A false `field' (None or '') yields an empty list without parsing.
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        union = AddressList(None)
        union.addresslist = self.addresslist[:]
        union.addresslist.extend(
            addr for addr in other.addresslist
            if addr not in self.addresslist)
        return union

    def __iadd__(self, other):
        # Set union, in-place
        for addr in other.addresslist:
            if addr not in self.addresslist:
                self.addresslist.append(addr)
        return self

    def __sub__(self, other):
        # Set difference
        difference = AddressList(None)
        difference.addresslist = [
            addr for addr in self.addresslist
            if addr not in other.addresslist]
        return difference

    def __isub__(self, other):
        # Set difference, in-place
        for addr in other.addresslist:
            if addr in self.addresslist:
                self.addresslist.remove(addr)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]