blob: b3b42bb379296f7768733e9ed54c8168f09a582e [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
7__all__ = [
8 'collapse_rfc2231_value',
9 'decode_params',
10 'decode_rfc2231',
11 'encode_rfc2231',
12 'formataddr',
13 'formatdate',
R David Murray875048b2011-07-20 11:41:21 -040014 'format_datetime',
Guido van Rossum8b3febe2007-08-30 01:15:14 +000015 'getaddresses',
16 'make_msgid',
Barry Warsawb742a962009-11-25 18:45:15 +000017 'mktime_tz',
Guido van Rossum8b3febe2007-08-30 01:15:14 +000018 'parseaddr',
19 'parsedate',
20 'parsedate_tz',
R David Murray875048b2011-07-20 11:41:21 -040021 'parsedate_to_datetime',
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022 'unquote',
23 ]
24
25import os
26import re
27import time
28import base64
29import random
30import socket
R David Murray875048b2011-07-20 11:41:21 -040031import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000032import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000033import warnings
34from io import StringIO
35
36from email._parseaddr import quote
37from email._parseaddr import AddressList as _AddressList
38from email._parseaddr import mktime_tz
39
Georg Brandl1aca31e2012-09-22 09:03:56 +020040from email._parseaddr import parsedate, parsedate_tz, _parsedate_tz
Guido van Rossum8b3febe2007-08-30 01:15:14 +000041
42from quopri import decodestring as _qdecode
43
44# Intrapackage imports
45from email.encoders import _bencode, _qencode
R David Murray8debacb2011-04-06 09:35:57 -040046from email.charset import Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000047
# String constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# Characters whose presence forces a display name to be wrapped in quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that get a backslash put in front of them in a display name.
escapesre = re.compile(r'[\\"]')
Guido van Rossum8b3febe2007-08-30 01:15:14 +000056
def _has_surrogates(s):
    """Return True if s contains surrogate-escaped binary data."""
    # Encoding with the default codec (utf-8) only fails when the string
    # holds surrogates, so a failed encode is the signal we look for.  This
    # is the fastest way to check; see issue 11454 for timings.
    try:
        s.encode()
    except UnicodeEncodeError:
        return True
    else:
        return False
Guido van Rossum8b3febe2007-08-30 01:15:14 +000067
# How to deal with a string containing bytes before handing it to the
# application through the 'normal' interface.
def _sanitize(string):
    """Return string with surrogate-escaped bytes made safe for display.

    Surrogate-escaped bytes are turned back into raw bytes and the result
    is decoded as utf-8 with the 'replace' error handler, so bytes that
    are not valid utf-8 become the unicode 'unknown' character (U+FFFD).
    Ordinary characters, including non-ASCII ones, pass through unchanged.
    """
    # utf-8 (not ascii) is required here: with the ascii codec any real
    # non-ASCII character in the string raises UnicodeEncodeError, because
    # 'surrogateescape' only knows how to encode lone surrogates.  A side
    # effect is that escaped bytes which happen to form valid utf-8 get
    # decoded instead of replaced.
    original_bytes = string.encode('utf-8', 'surrogateescape')
    return original_bytes.decode('utf-8', 'replace')
74
Antoine Pitroufd036452008-08-19 17:56:33 +000075
Guido van Rossum8b3febe2007-08-30 01:15:14 +000076# Helpers
77
def formataddr(pair, charset='utf-8'):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the result is suitable for an
    RFC 2822 From, To or Cc header.  If the realname part is false, the
    email address is returned on its own.

    Optional charset is the character set used to encode realname when it
    is not ASCII safe.  It may be a str or a Charset-like object that has a
    header_encode method.  Default is 'utf-8'.

    A UnicodeError is raised if the email address is not ASCII.
    """
    realname, addr = pair
    # The address MUST (per RFC) be ascii; let the UnicodeError propagate.
    addr.encode('ascii')
    if not realname:
        return addr
    try:
        realname.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: hand it to the charset for header encoding.
        codec = Charset(charset) if isinstance(charset, str) else charset
        return "%s <%s>" % (codec.header_encode(realname), addr)
    # ASCII display name: quote it when it contains specials, and always
    # backslash-escape backslashes and double quotes.
    if specialsre.search(realname):
        wrap = '"'
    else:
        wrap = ''
    escaped = escapesre.sub(r'\\\g<0>', realname)
    return '%s%s%s <%s>' % (wrap, escaped, wrap, addr)
109
110
Antoine Pitroufd036452008-08-19 17:56:33 +0000111
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
    # All field values are parsed together as one comma-separated list.
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
117
118
Antoine Pitroufd036452008-08-19 17:56:33 +0000119
# Matches a single encoded word of the form =?charset?encoding?atom?=,
# exposing the three parts as named groups.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
129
130
def _format_timetuple_and_zone(timetuple, zone):
    """Render a time tuple plus a zone string as an RFC 2822 style date.

    timetuple is indexed like a time.struct_time (year, month, day, hour,
    minute, second, weekday with Monday == 0).  zone is appended verbatim.
    Day and month names are always the English abbreviations.
    """
    day_name = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')[timetuple[6]]
    day = timetuple[2]
    month_name = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')[timetuple[1] - 1]
    year = timetuple[0]
    hour, minute, second = timetuple[3], timetuple[4], timetuple[5]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_name, day, month_name, year, hour, minute, second, zone)
Antoine Pitroufd036452008-08-19 17:56:33 +0000139
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    if not localtime:
        # UTC output: the offset is always -0000 (or the literal GMT).
        zone = 'GMT' if usegmt else '-0000'
        return _format_timetuple_and_zone(time.gmtime(timeval), zone)
    now = time.localtime(timeval)
    # Prefer the offset the platform reports for this particular instant.
    # The module-level time.timezone/time.altzone only describe the zone's
    # current rules, which gives a wrong offset for timestamps that fall
    # under different historical rules.
    offset = getattr(now, 'tm_gmtoff', None)
    if offset is None:
        # No tm_gmtoff on this platform: fall back to the global values,
        # which are seconds *west* of UTC, hence the sign flip.
        if time.daylight and now[-1]:
            offset = -time.altzone
        else:
            offset = -time.timezone
    # offset is now in seconds east of UTC; zero is rendered as +0000.
    sign = '-' if offset < 0 else '+'
    hours, remainder = divmod(abs(offset), 3600)
    zone = '%s%02d%02d' % (sign, hours, remainder // 60)
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000184
def format_datetime(dt, usegmt=False):
    """Turn a datetime into a date string as specified in RFC 2822.

    If usegmt is True, dt must be an aware datetime with an offset of zero.  In
    this case 'GMT' will be rendered instead of the normal +0000 required by
    RFC2822.  This is to support HTTP headers involving date stamps.

    A naive dt is rendered with the zone '-0000'.  Raises ValueError if
    usegmt is True and dt is naive or has a non-zero UTC offset.
    """
    now = dt.timetuple()
    if usegmt:
        # Check the actual offset rather than comparing tzinfo objects, so
        # that any tzinfo implementation representing UTC is accepted as
        # the docstring promises.  utcoffset() is None for naive datetimes,
        # which therefore still get rejected.
        if dt.utcoffset() != datetime.timedelta(0):
            raise ValueError("usegmt option requires a UTC datetime")
        zone = 'GMT'
    elif dt.tzinfo is None:
        zone = '-0000'
    else:
        zone = dt.strftime("%z")
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000202
Antoine Pitroufd036452008-08-19 17:56:33 +0000203
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.9667178265364102193@nightshade.la.mastaler.com>

    The id is built from the current UTC time (to the second), the process
    id and a random number.

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  Optional domain if given provides the
    portion of the message id after the '@'.  It defaults to the locally
    defined hostname.
    """
    timeval = time.time()
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
    pid = os.getpid()
    # The timestamp only has one-second resolution and the pid is constant
    # within a process, so ids made in the same second differ only in this
    # random part.  64 bits keeps collisions negligible; a 5-digit number
    # collides after a few hundred ids.
    randint = random.getrandbits(64)
    suffix = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, suffix, domain)
226
227
def parsedate_to_datetime(data):
    """Parse an RFC 2822 style date string into a datetime.

    If the parsed date carries no usable zone offset, a naive datetime is
    returned; otherwise the result is an aware datetime whose tzinfo is a
    fixed-offset datetime.timezone.

    Raises ValueError if data cannot be parsed as a date.
    """
    parsed = _parsedate_tz(data)
    if parsed is None:
        # _parsedate_tz signals failure with None; report that explicitly
        # instead of letting the unpacking below fail with a TypeError.
        raise ValueError('Invalid date value or format "%s"' % str(data))
    # The final element is the zone offset in seconds (or None).
    *dtuple, tz = parsed
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(*dtuple[:6],
            tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
234
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000235
def parseaddr(addr):
    """Parse addr into its (realname, email address) parts.

    Returns the first address found as a 2-tuple, or ('', '') when the
    parse yields no address at all.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
241
242
243# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes loses the quotes and has \\\\ and \\"
    un-escaped; a string wrapped in angle brackets just loses the brackets.
    Anything else (including strings shorter than two characters) is
    returned unchanged.
    """
    if len(str) <= 1:
        return str
    inner = str[1:-1]
    if str.startswith('"') and str.endswith('"'):
        return inner.replace('\\\\', '\\').replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return inner
    return str
252
253
Antoine Pitroufd036452008-08-19 17:56:33 +0000254
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000255# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    Splits s on the first two apostrophes into [charset, language, text].
    If s has fewer than two apostrophes, (None, None, s) is returned.
    """
    pieces = s.split(TICK, 2)
    return pieces if len(pieces) > 2 else (None, None, s)
262
263
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded (using charset, or ascii when charset is
    not given).  If neither charset nor language is given, the encoded
    string is returned on its own; otherwise it is prefixed with
    charset'language', using the empty string for a missing language.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
277
278
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)


def _rfc2231_segment_key(segment):
    """Sort key ordering segments by number, un-numbered segments first."""
    num, value, encoded = segment
    # A plain tuple sort fails when None and int numbers are mixed; keying
    # on "has a number" first keeps every comparison between like types.
    return (num is not None, 0 if num is None else num, value, encoded)


def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    The input is not modified.

    Returns a new list.  The first item is passed through as-is.  Ordinary
    parameters come back as (name, quoted value).  Parameters split into
    RFC 2231 continuations are joined in numerical order and come back as
    (name, quoted value), or as (name, (charset, language, quoted value))
    when any segment was %-encoded.  An empty params yields an empty list.
    """
    # Work on a list copy so any sequence type is accepted.
    params = list(params)
    if not params:
        return []
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number
        continuations.sort(key=_rfc2231_segment_key)
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
337
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse an RFC 2231 parameter value into a plain string.

    value is either a string, which is returned unquoted, or a 3-tuple of
    (charset, language, text) as produced by decode_params, whose text is
    decoded using charset with the given error handler.

    fallback_charset is used when the tuple's charset is None.  If the
    charset is not a known codec, the unquoted text is returned as-is.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    charset, language, text = value
    if charset is None:
        # decode_rfc2231 yields a None charset when it could not split the
        # value; decode with the fallback instead of failing in str().
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
R David Murrayd2d521e2012-05-25 23:22:59 -0400352
353
354#
355# datetime doesn't provide a localtime function yet, so provide one. Code
356# adapted from the patch in issue 9527. This may not be perfect, but it is
357# better than not having it.
358#
359
def localtime(dt=None, isdst=-1):
    """Return local time as an aware datetime object.

    With no arguments, the current time is returned.  Otherwise *dt* should
    be a datetime instance.  An aware *dt* is converted to the local time
    zone according to the system time zone database.  A naive *dt* (that
    is, dt.tzinfo is None) is assumed to already be in local time and gets
    the matching fixed-offset tzinfo attached.

    For a naive *dt*, *isdst* is passed to the system mktime as a hint: a
    positive or zero value makes it presume initially that summer time (for
    example, Daylight Saving Time) is or is not (respectively) in effect
    for the specified time, while a negative value lets it attempt to
    divine whether summer time is in effect.

    """
    if dt is None:
        return datetime.datetime.now(datetime.timezone.utc).astimezone()
    if dt.tzinfo is not None:
        return dt.astimezone()
    # We have a naive datetime.  Convert to a (localtime) timetuple and pass
    # to system mktime together with the isdst hint.  System mktime will
    # return seconds since epoch.
    fields = dt.timetuple()[:-1] + (isdst,)
    epoch_seconds = time.mktime(fields)
    local_struct = time.localtime(epoch_seconds)
    try:
        gmtoff = local_struct.tm_gmtoff
        zone_name = local_struct.tm_zone
    except AttributeError:
        # Compute UTC offset and compare with the value implied by tm_isdst.
        # If the values match, use the zone name implied by tm_isdst.
        offset = dt - datetime.datetime(*time.gmtime(epoch_seconds)[:6])
        in_dst = time.daylight and local_struct.tm_isdst > 0
        implied = -(time.altzone if in_dst else time.timezone)
        if offset == datetime.timedelta(seconds=implied):
            zone = datetime.timezone(offset, time.tzname[in_dst])
        else:
            zone = datetime.timezone(offset)
    else:
        zone = datetime.timezone(datetime.timedelta(seconds=gmtoff), zone_name)
    return dt.replace(tzinfo=zone)