blob: 39f790364c0c1010d9e1ea801120f4a15395e9bc [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Names exported by ``from email.utils import *``.  Every public helper
# defined (or re-exported) by this module is listed, including localtime().
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'format_datetime',
    'getaddresses',
    'localtime',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'parsedate_to_datetime',
    'unquote',
    ]
24
25import os
26import re
27import time
28import base64
29import random
30import socket
R David Murray875048b2011-07-20 11:41:21 -040031import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000032import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000033import warnings
34from io import StringIO
35
36from email._parseaddr import quote
37from email._parseaddr import AddressList as _AddressList
38from email._parseaddr import mktime_tz
39
# We need workarounds for bugs in these methods in older Pythons (see below)
41from email._parseaddr import parsedate as _parsedate
42from email._parseaddr import parsedate_tz as _parsedate_tz
R David Murray875048b2011-07-20 11:41:21 -040043from email._parseaddr import _parsedate_tz as __parsedate_tz
Guido van Rossum8b3febe2007-08-30 01:15:14 +000044
45from quopri import decodestring as _qdecode
46
47# Intrapackage imports
48from email.encoders import _bencode, _qencode
R David Murray8debacb2011-04-06 09:35:57 -040049from email.charset import Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000050
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# specialsre: characters whose presence forces a display name to be quoted.
# escapesre: characters that get a backslash put in front of them.
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[\\"]')
Guido van Rossum8b3febe2007-08-30 01:15:14 +000059
R David Murrayc27e5222012-05-25 15:01:48 -040060# How to figure out if we are processing strings that come from a byte
61# source with undecodable characters.
# Search callable: returns a match object when the string contains a low
# surrogate (U+DC00-U+DFFF) that is not preceded by a high surrogate and not
# followed by another low surrogate, otherwise None.
# The \A and \Z anchors are written with doubled backslashes because this is
# a non-raw literal (needed for the \uXXXX escapes); a bare '\A' is an invalid
# string escape and triggers a warning on newer Pythons.
_has_surrogates = re.compile(
    '([^\ud800-\udbff]|\\A)[\udc00-\udfff]([^\udc00-\udfff]|\\Z)').search
Guido van Rossum8b3febe2007-08-30 01:15:14 +000064
R David Murray0b6f6c82012-05-25 18:42:14 -040065# How to deal with a string containing bytes before handing it to the
66# application through the 'normal' interface.
def _sanitize(string):
    """Return *string* with surrogate-escaped bytes turned into U+FFFD."""
    # Encoding with 'surrogateescape' restores the escaped bytes; decoding
    # with 'replace' then maps each non-ASCII byte to the 'unknown' character.
    return string.encode('ascii', 'surrogateescape').decode('ascii', 'replace')
71
Antoine Pitroufd036452008-08-19 17:56:33 +000072
Guido van Rossum8b3febe2007-08-30 01:15:14 +000073# Helpers
74
def formataddr(pair, charset='utf-8'):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the result is suitable for an
    RFC 2822 From, To or Cc header.  When the first element of pair is
    false, the second element is returned unmodified.

    charset is only consulted when realname cannot be encoded as ASCII.  It
    may be a str naming a character set or a Charset-like object providing a
    header_encode method.  Default is 'utf-8'.

    A UnicodeError is raised when the address itself is not ASCII.
    """
    realname, addr = pair
    # The address MUST (per RFC) be ascii, so let the UnicodeError propagate.
    addr.encode('ascii')
    if not realname:
        return addr
    try:
        realname.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: emit it as an encoded word.
        codec = Charset(charset) if isinstance(charset, str) else charset
        return "%s <%s>" % (codec.header_encode(realname), addr)
    # ASCII display name: quote it when it contains specials and
    # backslash-escape any backslash or double quote.
    wrapper = '"' if specialsre.search(realname) else ''
    escaped = escapesre.sub(r'\\\g<0>', realname)
    return '%s%s%s <%s>' % (wrapper, escaped, wrapper, addr)
106
107
Antoine Pitroufd036452008-08-19 17:56:33 +0000108
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
    # All field values are joined into one comma-separated string so that a
    # single address-list parse covers every header.
    return _AddressList(COMMASPACE.join(fieldvalues)).addresslist
114
115
Antoine Pitroufd036452008-08-19 17:56:33 +0000116
# Pattern for an encoded word of the form =?charset?encoding?atom?= where
# encoding is "q" or "b" in either case.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
126
127
def _format_timetuple_and_zone(timetuple, zone):
    """Render a time tuple plus a preformatted zone string as a date string.

    Day and month names are always the English abbreviations; weekday index
    0 is Monday.
    """
    day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    year, month, day, hour, minute, second, weekday = timetuple[:7]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_names[weekday], day, month_names[month - 1],
        year, hour, minute, second, zone)
Antoine Pitroufd036452008-08-19 17:56:33 +0000136
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, when given, is a floating point time value as accepted by
    gmtime() and localtime(); otherwise the current time is used.

    localtime is a flag: when True, timeval is interpreted and rendered
    relative to the local timezone instead of UTC, taking daylight savings
    time into account.

    usegmt means the timezone is written as the ascii string "GMT" rather
    than a numeric offset.  This is needed for HTTP, and is only used when
    localtime is False.
    """
    # Note: strftime() is not usable here because it honors the locale, while
    # RFC 2822 requires the English day and month abbreviations.
    if timeval is None:
        timeval = time.time()
    if not localtime:
        # UTC rendering: the zone is either the literal 'GMT' or '-0000'.
        zone = 'GMT' if usegmt else '-0000'
        return _format_timetuple_and_zone(time.gmtime(timeval), zone)
    now = time.localtime(timeval)
    # Pick the DST or the standard offset, depending on whether the local
    # zone has daylight savings time and whether it is in effect.
    dst_active = time.daylight and now[-1]
    offset = time.altzone if dst_active else time.timezone
    hours, leftover = divmod(abs(offset), 3600)
    # offset is in seconds west of UTC while the rendered zone counts east of
    # UTC, so the signs differ.
    sign = '-' if offset > 0 else '+'
    zone = '%s%02d%02d' % (sign, hours, leftover // 60)
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000181
def format_datetime(dt, usegmt=False):
    """Turn a datetime into a date string as specified in RFC 2822.

    A naive datetime is rendered with the zone '-0000'; an aware datetime is
    rendered with its numeric offset as produced by strftime("%z").

    If usegmt is True, dt must be an aware datetime with an offset of zero.
    In this case 'GMT' will be rendered instead of the normal +0000 required
    by RFC2822.  This is to support HTTP headers involving date stamps.

    Raises ValueError when usegmt is True and dt is naive or has a non-zero
    UTC offset.
    """
    now = dt.timetuple()
    if usegmt:
        # Compare the offset rather than the tzinfo object so that any tzinfo
        # implementation reporting a zero offset is accepted.  utcoffset() is
        # None for a naive datetime, which is rejected as well.
        if dt.utcoffset() != datetime.timedelta(0):
            raise ValueError("usegmt option requires a UTC datetime")
        zone = 'GMT'
    elif dt.tzinfo is None:
        zone = '-0000'
    else:
        zone = dt.strftime("%z")
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000199
Antoine Pitroufd036452008-08-19 17:56:33 +0000200
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.11634325098364179341@nightshade.la.mastaler.com>

    The id is made of the current UTC time (to the second), the process id
    and a random number, separated by dots.

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  Optional domain if given provides the
    portion of the message id after the '@'.  It defaults to the locally
    defined hostname.
    """
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    pid = os.getpid()
    # The timestamp only has one-second resolution and the pid is constant
    # within a process, so the random part carries the uniqueness of ids
    # generated in quick succession.  Use 64 random bits: a small range
    # (such as 100000 values) collides after only a few hundred ids.
    randint = random.getrandbits(64)
    suffix = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, suffix, domain)
223
224
Antoine Pitroufd036452008-08-19 17:56:33 +0000225
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse a date string with _parsedate(); return None for false input."""
    return _parsedate(data) if data else None
233
234
def parsedate_tz(data):
    """Parse a date string with _parsedate_tz(); return None for false input."""
    return _parsedate_tz(data) if data else None
239
def parsedate_to_datetime(data):
    """Parse a date string into a datetime.datetime.

    Returns None when data is false (empty string or None).  When the parsed
    value carries no timezone offset the result is a naive datetime;
    otherwise it is an aware datetime whose tzinfo is a fixed-offset
    datetime.timezone.

    Raises ValueError when data cannot be parsed as a date.
    """
    if not data:
        return None
    parsed = __parsedate_tz(data)
    if parsed is None:
        # The parser signals failure with None; report that explicitly
        # instead of failing on the unpacking below.
        raise ValueError('Invalid date value or format "%s"' % str(data))
    *dtuple, tz = parsed
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(*dtuple[:6],
            tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
248
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000249
def parseaddr(addr):
    """Return the first (realname, email_address) pair parsed from addr.

    A pair of empty strings is returned when no address is found.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
255
256
257# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A value wrapped in double quotes loses them and has its backslash
    escapes for backslash and double quote undone; a value wrapped in angle
    brackets loses the brackets.  Anything else is returned unchanged.
    """
    if len(str) <= 1:
        return str
    inner = str[1:-1]
    if str.startswith('"') and str.endswith('"'):
        return inner.replace('\\\\', '\\').replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return inner
    return str
266
267
Antoine Pitroufd036452008-08-19 17:56:33 +0000268
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000269# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on the first two single quotes.  When both are present the
    three resulting parts are returned as a list; otherwise the result is
    the tuple (None, None, s).
    """
    pieces = s.split(TICK, 2)
    if len(pieces) > 2:
        return pieces
    return None, None, s
276
277
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always %-quoted (using charset, or 'ascii' when charset is not
    given).  If neither charset nor language is given, the quoted string is
    returned on its own.  Otherwise the result has the form
    charset'language'quoted, with the empty string standing in for a missing
    language.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
291
292
# Matches parameter names of the form name*, name*N and name*N*; the 'num'
# group is None when no continuation number is present.
rfc2231_continuation = re.compile(
    r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$', re.ASCII)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000295
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    The first entry is copied to the result untouched.  Every other plain
    parameter is unquoted and re-quoted; parameters using the RFC 2231
    ``name*`` / ``name*N`` / ``name*N*`` forms are gathered per name, joined
    in sorted order and appended after all plain parameters.  A parameter
    with at least one %-encoded segment yields a
    (charset, language, '"value"') 3-tuple as its value.

    The input sequence is not modified.
    """
    first_name, first_value = params[0]
    decoded = [(first_name, first_value)]
    # Maps a parameter's name to its list of continuations.  Each entry is a
    # 3-tuple of the continuation number (or None), the string value, and a
    # flag telling whether that particular segment is %-encoded.
    pending = {}
    for name, value in params[1:]:
        is_encoded = name.endswith('*')
        value = unquote(value)
        match = rfc2231_continuation.match(name)
        if match is None:
            decoded.append((name, '"%s"' % quote(value)))
            continue
        name, num = match.group('name', 'num')
        if num is not None:
            num = int(num)
        pending.setdefault(name, []).append((num, value, is_encoded))
    for name, continuations in pending.items():
        # Sort by continuation number, then concatenate the segments,
        # converting %-encodings for the encoded ones.  If any segment is
        # encoded, the joined string must begin with the charset and language
        # specifiers.
        continuations.sort()
        segments = []
        extended = False
        for _num, segment, is_encoded in continuations:
            if is_encoded:
                # Decode as "latin-1", so the characters in the segment
                # directly represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                segment = urllib.parse.unquote(segment, encoding="latin-1")
                extended = True
            segments.append(segment)
        joined = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, joined = decode_rfc2231(joined)
            decoded.append((name, (charset, language, '"%s"' % joined)))
        else:
            decoded.append((name, '"%s"' % joined))
    return decoded
351
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a parameter value into a plain string.

    value is either a string, which is returned unquoted, or a
    (charset, language, text) 3-tuple.  For a 3-tuple, text is reinterpreted
    as raw bytes and decoded with charset using the given errors handler.
    fallback_charset is used when charset is None.  If the charset is not a
    known codec, the unquoted text is returned instead.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    charset, language, text = value
    if charset is None:
        # No charset could be determined for this value; decode with the
        # caller-supplied fallback instead of failing inside str().
        charset = fallback_charset
    # While text comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
R David Murrayd2d521e2012-05-25 23:22:59 -0400366
367
368#
369# datetime doesn't provide a localtime function yet, so provide one. Code
370# adapted from the patch in issue 9527. This may not be perfect, but it is
371# better than not having it.
372#
373
def localtime(dt=None, isdst=-1):
    """Return local time as an aware datetime object.

    If called without arguments, return current time.  Otherwise *dt*
    argument should be a datetime instance, and it is converted to the
    local time zone according to the system time zone database.  If *dt* is
    naive (that is, dt.tzinfo is None), it is assumed to be in local time.
    In this case, a positive or zero value for *isdst* causes localtime to
    presume initially that summer time (for example, Daylight Saving Time)
    is or is not (respectively) in effect for the specified time.  A
    negative value for *isdst* causes the localtime() function to attempt
    to divine whether summer time is in effect for the specified time.

    The returned datetime carries a fixed-offset datetime.timezone and keeps
    the microseconds of *dt*.
    """
    if dt is None:
        seconds = time.time()
    elif dt.tzinfo is None:
        # A naive datetime is given.  Convert to a (localtime) timetuple and
        # pass to system mktime together with the isdst hint.  System mktime
        # will return seconds since epoch.
        seconds = time.mktime(dt.timetuple()[:-1] + (isdst,))
    else:
        # An aware datetime is given.  Use aware datetime arithmetics to find
        # seconds since epoch.
        epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
        seconds = (dt - epoch).total_seconds()
    tm = time.localtime(seconds)

    # Prefer the offset and zone name the platform reports for this very
    # instant; the module-level time.timezone/time.altzone only describe the
    # rules currently in force and can be wrong for other dates.
    gmtoff = getattr(tm, 'tm_gmtoff', None)
    zone_name = getattr(tm, 'tm_zone', None)
    if gmtoff is not None and zone_name is not None:
        tz = datetime.timezone(datetime.timedelta(seconds=gmtoff), zone_name)
    else:
        # Fallback when the struct_time does not provide tm_gmtoff/tm_zone.
        dst = bool(time.daylight and tm.tm_isdst > 0)
        offset = time.altzone if dst else time.timezone
        tz = datetime.timezone(datetime.timedelta(seconds=-offset),
                               time.tzname[dst])

    if dt is None:
        return datetime.datetime.fromtimestamp(seconds, tz)
    if dt.tzinfo is not None:
        # Exact conversion; avoids a round trip through a float timestamp.
        return dt.astimezone(tz)
    # mktime() works on a time tuple, which has no sub-second field, so put
    # the microseconds of the naive input back.
    local = datetime.datetime.fromtimestamp(seconds, tz)
    return local.replace(microsecond=dt.microsecond)