blob: b82d5c578b0b797cb3cc4a01814600e9deb50f53 [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Names exported by ``from <this module> import *``; every entry is either
# defined or imported further down in this file.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'format_datetime',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'parsedate_to_datetime',
    'unquote',
    ]
24
25import os
26import re
27import time
28import base64
29import random
30import socket
R David Murray875048b2011-07-20 11:41:21 -040031import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000032import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000033import warnings
34from io import StringIO
35
36from email._parseaddr import quote
37from email._parseaddr import AddressList as _AddressList
38from email._parseaddr import mktime_tz
39
# We need workarounds for bugs in these methods in older Pythons (see below)
41from email._parseaddr import parsedate as _parsedate
42from email._parseaddr import parsedate_tz as _parsedate_tz
R David Murray875048b2011-07-20 11:41:21 -040043from email._parseaddr import _parsedate_tz as __parsedate_tz
Guido van Rossum8b3febe2007-08-30 01:15:14 +000044
45from quopri import decodestring as _qdecode
46
47# Intrapackage imports
48from email.encoders import _bencode, _qencode
R David Murray8debacb2011-04-06 09:35:57 -040049from email.charset import Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000050
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# A display name containing any of these characters is wrapped in double
# quotes by formataddr().
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() backslash-escapes in a display name.
escapesre = re.compile(r'[\\"]')

# How to figure out if we are processing strings that come from a byte
# source with undecodable characters.
#
# The pattern looks for a code point in U+DC00-U+DFFF that is not part of a
# U+D800-U+DBFF / U+DC00-U+DFFF pair.  The \uXXXX escapes are deliberately
# resolved by the string literal; the regex anchors \A and \Z are written with
# doubled backslashes so that they are not (invalid) string escapes.
_has_surrogates = re.compile(
    '([^\ud800-\udbff]|\\A)[\udc00-\udfff]([^\udc00-\udfff]|\\Z)').search
Guido van Rossum8b3febe2007-08-30 01:15:14 +000064
Antoine Pitroufd036452008-08-19 17:56:33 +000065
Guido van Rossum8b3febe2007-08-30 01:15:14 +000066# Helpers
67
def formataddr(pair, charset='utf-8'):
    """Build a header-ready address string from (realname, email_address).

    This is the inverse of parseaddr(): the result is suitable for an
    RFC 2822 From, To or Cc header.

    When the real name is false, the email address alone is returned.

    charset is only consulted when the real name cannot be encoded as ASCII.
    It may be a str naming a character set, or a Charset-like object that
    provides a header_encode method.  The default is 'utf-8'.

    A non-ASCII email address raises UnicodeError.
    """
    realname, addr = pair
    # The address MUST (per RFC) be ascii; this call exists purely so that a
    # UnicodeError propagates to the caller when it is not.
    addr.encode('ascii')
    if not realname:
        return addr
    try:
        realname.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: hand it to the charset's header encoder.
        if isinstance(charset, str):
            charset = Charset(charset)
        return "%s <%s>" % (charset.header_encode(realname), addr)
    # ASCII display name: quote it only if it contains a special character,
    # but always backslash-escape backslashes and double quotes.
    wrapper = '"' if specialsre.search(realname) else ''
    escaped = escapesre.sub(r'\\\g<0>', realname)
    return '%s%s%s <%s>' % (wrapper, escaped, wrapper, addr)
99
100
Antoine Pitroufd036452008-08-19 17:56:33 +0000101
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    fieldvalues is an iterable of header values.  Each value is converted
    with str() before parsing, so objects with a meaningful string form (for
    example email.header.Header instances) are accepted alongside plain
    strings.  An empty iterable yields an empty list.
    """
    # Join everything into one comma-separated address list and parse it in
    # a single pass.
    joined = COMMASPACE.join(str(value) for value in fieldvalues)
    return _AddressList(joined).addresslist
107
108
Antoine Pitroufd036452008-08-19 17:56:33 +0000109
# Matches one encoded word of the form =?charset?encoding?atom?= and exposes
# the three parts as the named groups "charset", "encoding" and "atom".  The
# encoding letter is matched without regard to case (re.IGNORECASE).
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
119
120
def _format_timetuple_and_zone(timetuple, zone):
    """Render a time tuple plus a zone string as 'Fri, 09 Nov 2001 01:08:47 -0000'.

    timetuple is indexed as (year, month, day, hour, minute, second, weekday),
    with month counted from 1 and weekday counted from 0 (Monday).  zone is
    appended verbatim.  Day and month names are fixed English abbreviations.
    """
    weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    weekday = weekday_names[timetuple[6]]
    day = timetuple[2]
    month = month_names[timetuple[1] - 1]
    year = timetuple[0]
    hour = timetuple[3]
    minute = timetuple[4]
    second = timetuple[5]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        weekday, day, month, year, hour, minute, second, zone)
Antoine Pitroufd036452008-08-19 17:56:33 +0000129
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    if localtime:
        now = time.localtime(timeval)
        # Prefer the UTC offset the platform reports for this very timestamp;
        # the module-wide time.timezone/time.altzone values only describe the
        # zone's current rules and can be wrong for other dates.
        east = getattr(now, 'tm_gmtoff', None)
        if east is None:
            # Fallback: pick the DST or standard offset.  These values are in
            # seconds *west* of UTC, hence the sign flip.
            if time.daylight and now.tm_isdst:
                east = -time.altzone
            else:
                east = -time.timezone
        sign = '-' if east < 0 else '+'
        hours, remainder = divmod(abs(east), 3600)
        zone = '%s%02d%02d' % (sign, hours, remainder // 60)
    else:
        now = time.gmtime(timeval)
        # Timezone offset is always -0000
        zone = 'GMT' if usegmt else '-0000'
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000174
def format_datetime(dt, usegmt=False):
    """Turn a datetime into a date string as specified in RFC 2822.

    If usegmt is True, dt must be an aware datetime with an offset of zero.  In
    this case 'GMT' will be rendered instead of the normal +0000 required by
    RFC2822.  This is to support HTTP headers involving date stamps.

    A naive datetime is rendered with the zone '-0000'; an aware datetime is
    rendered with its numeric offset.

    Raises ValueError when usegmt is True and dt is naive or has a non-zero
    UTC offset.
    """
    now = dt.timetuple()
    if usegmt:
        # Test the actual offset rather than the identity of the tzinfo
        # object, so every zero-offset tzinfo implementation is accepted.
        # utcoffset() is None for naive datetimes, which fails the test too.
        if dt.utcoffset() != datetime.timedelta(0):
            raise ValueError("usegmt option requires a UTC datetime")
        zone = 'GMT'
    elif dt.tzinfo is None:
        zone = '-0000'
    else:
        zone = dt.strftime("%z")
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000192
Antoine Pitroufd036452008-08-19 17:56:33 +0000193
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    The id is assembled from the current UTC timestamp, the process id and a
    random integer below 100000.

    Optional idstring, if given, is appended (after a dot) to strengthen the
    uniqueness of the message id.  Optional domain, if given, provides the
    portion of the message id after the '@'; otherwise the value returned by
    socket.getfqdn() is used.
    """
    stamp = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    process_id = os.getpid()
    nonce = random.randrange(100000)
    suffix = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (stamp, process_id, nonce, suffix, domain)
216
217
Antoine Pitroufd036452008-08-19 17:56:33 +0000218
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse a date string via email._parseaddr.parsedate.

    Empty or otherwise false input short-circuits to None without calling the
    underlying parser; anything else returns whatever that parser returns.
    """
    return _parsedate(data) if data else None
226
227
def parsedate_tz(data):
    """Parse a date string via email._parseaddr.parsedate_tz.

    Empty or otherwise false input short-circuits to None without calling the
    underlying parser; anything else returns whatever that parser returns.
    """
    return _parsedate_tz(data) if data else None
232
def parsedate_to_datetime(data):
    """Convert a date string into a datetime.datetime.

    Returns None when data is empty or otherwise false.  When the parsed
    result carries no timezone offset a naive datetime is returned; otherwise
    the datetime is aware, with a fixed-offset datetime.timezone.

    Raises ValueError when data cannot be parsed as a date.
    """
    if not data:
        return None
    parsed = __parsedate_tz(data)
    if parsed is None:
        # The parser signals failure with None; report that explicitly rather
        # than letting the unpacking below fail with an unrelated TypeError.
        raise ValueError('Invalid date value or format "%s"' % str(data))
    # The last element is the UTC offset in seconds (or None); only the first
    # six of the remaining fields are datetime constructor arguments.
    *dtuple, tz = parsed
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(*dtuple[:6],
            tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
241
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000242
def parseaddr(addr):
    """Parse addr and return its first address as a 2-tuple.

    The tuple is the first entry of the parsed address list.  When parsing
    yields no addresses at all, ('', '') is returned instead.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
248
249
250# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A value wrapped in double quotes loses them and has the backslash escapes
    for backslash and double quote undone.  A value wrapped in angle brackets
    just loses the brackets.  Anything else, including any value shorter than
    two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        # Undo escaped backslashes first, then escaped quotes.
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
259
260
Antoine Pitroufd036452008-08-19 17:56:33 +0000261
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000262# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on its first two single quotes.  When that yields three
    pieces they are returned as a list [charset, language, text]; otherwise
    the 3-tuple (None, None, s) is returned.
    """
    pieces = s.split(TICK, 2)
    if len(pieces) > 2:
        return pieces
    return None, None, s
269
270
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded (no characters are treated as safe), using
    charset, or ASCII when charset is None.

    If neither charset nor language is given, then the percent-encoded string
    is returned on its own.  Otherwise the result has the form
    charset'language'encoded-text.  If charset is given but not language, the
    empty string is used for language.  If language is given but not charset,
    the charset is written as 'us-ascii', matching the encoding that was
    actually applied.
    """
    s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return s
    if charset is None:
        # Never emit the literal text "None" as the charset label.
        charset = 'us-ascii'
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, s)
284
285
# Matches a parameter name that ends in '*', optionally followed by a decimal
# segment number and a further optional '*'.  The base name and the number are
# exposed as the groups "name" and "num"; "num" is None when no number is
# present.  re.ASCII restricts \w to ASCII word characters.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000288
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.

    Returns a new list.  The first entry of params is passed through as-is.
    Every other ordinary parameter is returned as (name, '"value"') with the
    value unquoted and then re-quoted.  Parameters whose names carry RFC 2231
    markers (a trailing '*', optionally with a segment number) are collected
    per base name, joined in segment order and appended after the ordinary
    ones: as (name, '"value"') when no segment was %-encoded, or as
    (name, (charset, language, '"value"')) when at least one was.

    An empty params sequence yields an empty list.
    """
    if not params:
        return []
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterating over a slice leaves the caller's sequence untouched and avoids
    # repeatedly popping from the front of a list.
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number
        continuations.sort()
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
344
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a parameter value into a plain string.

    value is either a string, which is returned unquoted, or a 3-tuple
    (charset, language, text).  For a 3-tuple the text is reinterpreted as
    raw bytes and decoded with charset, using the given errors handler.

    fallback_charset is used when the tuple's charset is None, i.e. when no
    charset could be extracted from the value.  If the charset names an
    unknown codec, the unquoted text is returned undecoded.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    charset, language, text = value
    if charset is None:
        # Without this, str() below would be handed None as the encoding and
        # raise TypeError.
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)