blob: 41ff6f8c000d57d23445fa529b33297b1ffa07fc [file] [log] [blame]
Jingwen Chen475b3cc2021-01-05 21:45:16 -05001# Copyright (C) 2002-2007 Python Software Foundation
2# Contact: email-sig@python.org
3
4"""Email address parsing code.
5
6Lifted directly from rfc822.py. This should eventually be rewritten.
7"""
8
9__all__ = [
10 'mktime_tz',
11 'parsedate',
12 'parsedate_tz',
13 'quote',
14 ]
15
16import time, calendar
17
# String constants.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Names accepted in the month position of a date field.  The abbreviated
# names come first and the full names second, so an index of 12 or more
# refers to the same month as that index minus 12.
_monthnames = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

# Abbreviated day-of-week names.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# Named time zones, written as signed hhmm integers (-500 is five hours
# behind UTC).
#
# The military time zones defined in RFC822 are left out, apart from Z:
# according to RFC1123 the description in RFC822 gets the signs wrong, so
# those zones cannot be relied on.  RFC1123 recommends numeric timezone
# indicators instead of timezone names.
_timezones = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
    'EST': -500, 'EDT': -400,  # Eastern
    'CST': -600, 'CDT': -500,  # Central
    'MST': -700, 'MDT': -600,  # Mountain
    'PST': -800, 'PDT': -700,  # Pacific
}
43
44
def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    This is a thin wrapper around _parsedate_tz().  When that function
    yields nothing (empty or unparseable input) this returns None.
    Otherwise the parsed fields are returned as a tuple whose last element
    is the time zone offset; an offset that _parsedate_tz() reported as
    None is replaced by 0.
    """
    parsed = _parsedate_tz(data)
    if parsed:
        if parsed[9] is None:
            # Callers of this function always get a numeric offset.
            parsed[9] = 0
        return tuple(parsed)
    return None
56
def _parsedate_tz(data):
    """Convert date to extended time tuple.

    Returns a 10-element list
    [year, month, day, hour, minute, second, 0, 1, -1, tzoffset], or None
    when `data' is empty, contains only whitespace, or cannot be parsed.

    The last (additional) element is the time zone offset in seconds, except
    if the timezone was specified as -0000.  In that case the last element is
    None.  This indicates a UTC timestamp that explicitly declaims knowledge
    of the source timezone, as opposed to a +0000 timestamp that indicates
    the source timezone really was UTC.  The offset is also None when the
    zone is neither a known name nor an integer.

    Malformed input never raises; it yields None.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: nothing left to index into.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time ("10:00+0100"); split it off.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if mm not in _monthnames:
        # Accept "Jan 1" as well as "1 Jan".
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names follow the abbreviations in the table.
        mm -= 12
    # An RFC 850 split such as "-Jan-99" can leave an empty day field;
    # endswith() is safe on it and int() below then rejects it.
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are transposed.
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        # A year field that was empty or just "," cannot be parsed.
        return None
    if not yy[0].isdigit():
        # Year and zone are transposed.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    elif len(tm) == 1 and '.' in tm[0]:
        # Some non-compliant MUAs use '.' to separate time elements.
        tm = tm[0].split('.')
        if len(tm) == 2:
            [thh, tmm] = tm
            tss = '0'
        elif len(tm) == 3:
            [thh, tmm, tss] = tm
        else:
            # Neither hh.mm nor hh.mm.ss: reject instead of leaving the
            # time fields unassigned.
            return None
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
        if tzoffset == 0 and tz.startswith('-'):
            # "-0000": UTC time, source zone explicitly unknown.
            tzoffset = None
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset // 100) * 3600 + (tzoffset % 100) * 60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return [yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset]
172
173
def parsedate(data):
    """Convert a time string to a time tuple.

    Delegates to parsedate_tz() and keeps only the first nine elements of
    its result, dropping the trailing time zone offset.  Anything that is
    not a tuple (such as None) is passed through unchanged.
    """
    parsed = parsedate_tz(data)
    return parsed[:9] if isinstance(parsed, tuple) else parsed
181
182
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    `data[9]' is the zone offset in seconds.  When it is a number, the
    first fields are interpreted with calendar.timegm() and the offset is
    subtracted.  When it is None, the first eight fields are handed to
    time.mktime() with the DST flag set to -1.
    """
    offset = data[9]
    if offset is not None:
        return calendar.timegm(data) - offset
    # No zone info, so localtime is better assumption than GMT
    return time.mktime(data[:8] + (-1,))
191
192
def quote(str):
    """Prepare string to be used in a quoted string.

    Every backslash and every double quote in `str' is preceded by a
    backslash, turning it into a quoted pair.  These are the only
    characters that need to be quoted inside a quoted string.  The
    surrounding double quotes are not added.
    """
    escaped = str
    # Backslashes must be handled first so that the backslashes inserted
    # in front of double quotes are not themselves doubled.
    for special in ('\\', '"'):
        escaped = escaped.replace(special, '\\' + special)
    return escaped
201
202
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser keeps a cursor (`pos') into the header text (`field') and a
    list of the comments collected for the address currently being parsed
    (`commentlist').  The get*() methods consume text at the cursor and
    advance it.

    Note: this class interface is deprecated and may be removed in the future.
    Use email.utils.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Skip white space and extract comments.

        Comments are appended to self.commentlist.  Returns the skipped
        spaces and tabs as a string; CR and LF are skipped but not
        included in the result.
        """
        wslist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                if self.field[self.pos] not in '\n\r':
                    wslist.append(self.field[self.pos])
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break
        return EMPTYSTRING.join(wslist)

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each one a
        (name, address) pair.  A getaddress() call that yields nothing
        contributes a ('', '') pair.
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: usually one, several for
        a group, and none when nothing usable was found.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Take a snapshot, not an alias: getphraselist() appends to
        # self.commentlist in place, and the addr-spec branch below must be
        # able to undo that before scanning the same text a second time.
        # Otherwise every comment seen here would be recorded twice.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Step over a stray special so the caller's loop advances.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Domain of a source route: parsed and discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec.
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns the local part alone when no `@' follows it, the full
        `local@domain' string otherwise, and the empty string when the
        domain turns out to be empty.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            preserve_ws = True
            if self.field[self.pos] == '.':
                # Drop whitespace recorded just before the dot.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                aslist.append('.')
                self.pos += 1
                preserve_ws = False
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                # Drop trailing whitespace before leaving the local part.
                if aslist and not aslist[-1].strip():
                    aslist.pop()
                break
            else:
                aslist.append(self.getatom())
            ws = self.gotonext()
            if preserve_ws and ws:
                aslist.append(ws)

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string when a second `@' is met inside the
        domain.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # bpo-34155: Don't parse domains with two `@` like
                # `a@malicious.org@important.com`.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        Returns the fragment without its delimiters.  A backslash is
        dropped and the character after it is taken literally.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        # True while the previous character was an unconsumed backslash.
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Whitespace between words is skipped, and
        any comments met along the way are appended to self.commentlist.
        Returns the list of words.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
503
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed addresses are kept in `addresslist'.  The arithmetic
    operators treat two instances as sets of addresses.
    """

    def __init__(self, field):
        """Parse `field'; a false value gives an empty address list."""
        AddrlistClass.__init__(self, field)
        self.addresslist = self.getaddrlist() if field else []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __add__(self, other):
        """Set union: a new list with self's entries, then other's new ones."""
        union = AddressList(None)
        mine = self.addresslist
        union.addresslist = mine[:] + [
            entry for entry in other.addresslist if entry not in mine
        ]
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        for entry in other.addresslist:
            if entry not in self.addresslist:
                self.addresslist.append(entry)
        return self

    def __sub__(self, other):
        """Set difference: a new list with self's entries not in other."""
        difference = AddressList(None)
        difference.addresslist.extend(
            entry for entry in self.addresslist
            if entry not in other.addresslist
        )
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        for entry in other.addresslist:
            if entry in self.addresslist:
                self.addresslist.remove(entry)
        return self

    def __getitem__(self, index):
        """Make indexing, slices, and 'in' work."""
        return self.addresslist[index]
549 return self.addresslist[index]