blob: aecea656e66ac9064aabe82d8c8396b7c436dffb [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Names exported by ``from <this module> import *``.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'format_datetime',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'parsedate_to_datetime',
    'unquote',
    ]
24
25import os
26import re
27import time
28import base64
29import random
30import socket
R David Murray875048b2011-07-20 11:41:21 -040031import datetime
Jeremy Hylton1afc1692008-06-18 20:49:58 +000032import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000033import warnings
34from io import StringIO
35
36from email._parseaddr import quote
37from email._parseaddr import AddressList as _AddressList
38from email._parseaddr import mktime_tz
39
# We need workarounds for bugs in these methods in older Pythons (see below)
41from email._parseaddr import parsedate as _parsedate
42from email._parseaddr import parsedate_tz as _parsedate_tz
R David Murray875048b2011-07-20 11:41:21 -040043from email._parseaddr import _parsedate_tz as __parsedate_tz
Guido van Rossum8b3febe2007-08-30 01:15:14 +000044
45from quopri import decodestring as _qdecode
46
47# Intrapackage imports
48from email.encoders import _bencode, _qencode
R David Murray8debacb2011-04-06 09:35:57 -040049from email.charset import Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000050
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# Characters whose presence makes formataddr() wrap the name in quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() prefixes with a backslash inside the name.
escapesre = re.compile(r'[][\\()"]')
59
60
Antoine Pitroufd036452008-08-19 17:56:33 +000061
Guido van Rossum8b3febe2007-08-30 01:15:14 +000062# Helpers
63
def formataddr(pair, charset='utf-8'):
    """Build an RFC 2822 address header value from (realname, email_address).

    This is the inverse of parseaddr(): the result is suitable for a From,
    To or Cc header.  When the realname element of pair is false, the
    address is returned on its own.

    charset is only consulted when realname cannot be encoded as ASCII.  It
    may be a str naming a character set, or a Charset-like object providing
    a header_encode method.  Default is 'utf-8'.

    A UnicodeError is raised when the address itself is not ASCII.
    """
    name, address = pair
    # The address MUST (per RFC) be ascii; encode() raises if it is not.
    address.encode('ascii')
    if not name:
        return address
    try:
        name.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: let the charset produce an encoded word.
        codec = Charset(charset) if isinstance(charset, str) else charset
        return "%s <%s>" % (codec.header_encode(name), address)
    # ASCII display name: quote it if it contains specials, and
    # backslash-escape the characters matched by escapesre.
    quotes = '"' if specialsre.search(name) else ''
    escaped = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, escaped, quotes, address)
95
96
Antoine Pitroufd036452008-08-19 17:56:33 +000097
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) pairs parsed from fieldvalues.

    The field values are joined with ', ' and parsed as one address list.
    """
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
103
104
Antoine Pitroufd036452008-08-19 17:56:33 +0000105
# Matches an encoded word of the form =?charset?encoding?atom?= and exposes
# the three parts as named groups.
ecre = re.compile(
    r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''',
    re.VERBOSE | re.IGNORECASE)
115
116
def _format_timetuple_and_zone(timetuple, zone):
    """Render a time tuple plus a zone string as 'Day, DD Mon YYYY HH:MM:SS zone'.

    Day and month names are always the English abbreviations.  timetuple is
    indexed like a struct_time: year, month, day, hour, minute, second and
    weekday (0 is Monday) at positions 0-6.
    """
    day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    weekday = day_names[timetuple[6]]
    month = month_names[timetuple[1] - 1]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        weekday, timetuple[2], month,
        timetuple[0], timetuple[3], timetuple[4], timetuple[5],
        zone)
Antoine Pitroufd036452008-08-19 17:56:33 +0000125
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Return a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, when given, is a floating point time value as accepted by
    gmtime() and localtime(); when omitted the current time is used.

    localtime is a flag: when true, timeval is rendered relative to the
    local timezone (with the daylight-saving offset when DST is in effect)
    instead of UTC.

    usegmt is a flag: when true the zone is written as the ASCII string
    "GMT" rather than the numeric "-0000".  This is needed for HTTP, and it
    only has an effect when localtime is false.
    """
    # strftime() is deliberately avoided: it honors the locale, whereas
    # RFC 2822 requires the English day and month abbreviations.
    if timeval is None:
        timeval = time.time()
    if not localtime:
        # UTC rendering: the zone is either the literal GMT or -0000.
        zone = 'GMT' if usegmt else '-0000'
        return _format_timetuple_and_zone(time.gmtime(timeval), zone)
    now = time.localtime(timeval)
    # Use the DST offset only when the local zone has DST and the DST flag
    # (last field of the time tuple) is set.
    offset = time.altzone if (time.daylight and now[-1]) else time.timezone
    hours, leftover = divmod(abs(offset), 3600)
    # offset counts seconds *west* of UTC while the header wants the zone
    # *east* of UTC, hence the inverted sign.
    sign = '-' if offset > 0 else '+'
    zone = '%s%02d%02d' % (sign, hours, leftover // 60)
    return _format_timetuple_and_zone(now, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000170
def format_datetime(dt, usegmt=False):
    """Turn a datetime into a date string as specified in RFC 2822.

    With usegmt true, dt must be an aware datetime whose tzinfo compares
    equal to datetime.timezone.utc; the zone is then rendered as 'GMT'
    (to support HTTP date stamps) and a ValueError is raised for any other
    dt.  Otherwise a naive dt gets the zone '-0000' and an aware dt gets
    its "%z" offset.
    """
    timetuple = dt.timetuple()
    if usegmt:
        if dt.tzinfo is None or dt.tzinfo != datetime.timezone.utc:
            raise ValueError("usegmt option requires a UTC datetime")
        zone = 'GMT'
    else:
        zone = '-0000' if dt.tzinfo is None else dt.strftime("%z")
    return _format_timetuple_and_zone(timetuple, zone)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000188
Antoine Pitroufd036452008-08-19 17:56:33 +0000189
def make_msgid(idstring=None, domain=None):
    """Return a string suitable for an RFC 2822 compliant Message-ID, e.g.:

    <142480216486.20800.16526388040877946887@nightshade.la.mastaler.com>

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  Optional domain if given provides the
    portion of the message id after the '@'.  It defaults to the locally
    defined hostname.
    """
    # Hundredths of a second plus 64 random bits: a one-second timestamp
    # with a five-digit random number collides far too easily when a single
    # process creates many ids in quick succession.
    timeval = int(time.time() * 100)
    pid = os.getpid()
    randint = random.getrandbits(64)
    idstring = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%d.%d.%d%s@%s>' % (timeval, pid, randint, idstring, domain)
212
213
Antoine Pitroufd036452008-08-19 17:56:33 +0000214
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000215# These functions are in the standalone mimelib version only because they've
216# subsequently been fixed in the latest Python versions. We use this to worm
217# around broken older Pythons.
def parsedate(data):
    """Parse a date string with _parsedate(); return None for false data."""
    return _parsedate(data) if data else None
222
223
def parsedate_tz(data):
    """Parse a date string with _parsedate_tz(); return None for false data."""
    return _parsedate_tz(data) if data else None
228
def parsedate_to_datetime(data):
    """Convert a date string into a datetime.datetime.

    Returns None when data is false (empty string or None).  When the
    parsed value carries no timezone offset the result is a naive datetime;
    otherwise it is an aware datetime whose tzinfo is a fixed-offset
    datetime.timezone.

    Raises ValueError when data is non-empty but cannot be parsed as a date.
    """
    if not data:
        return None
    parsed = __parsedate_tz(data)
    if parsed is None:
        # The parser signals failure with None; report it explicitly rather
        # than letting the unpacking below fail with an opaque TypeError.
        raise ValueError('Invalid date value or format "%s"' % str(data))
    *dtuple, tz = parsed
    if tz is None:
        return datetime.datetime(*dtuple[:6])
    return datetime.datetime(
        *dtuple[:6],
        tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
237
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000238
def parseaddr(addr):
    """Parse addr and return the first address found as a 2-tuple.

    Returns ('', '') when no address can be extracted.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
244
245
246# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A value wrapped in double quotes loses them and has its backslash
    escapes for backslash and double quote undone; a value wrapped in angle
    brackets just loses the brackets.  Anything else, including strings
    shorter than two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    inner = str[1:-1]
    if str.startswith('"') and str.endswith('"'):
        return inner.replace('\\\\', '\\').replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return inner
    return str
255
256
Antoine Pitroufd036452008-08-19 17:56:33 +0000257
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000258# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on its first two single quotes.  When both are present the
    three resulting pieces are returned (as a list); otherwise the result
    is (None, None, s).
    """
    pieces = s.split(TICK, 2)
    return pieces if len(pieces) > 2 else (None, None, s)
265
266
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded (with no safe characters), using charset,
    or 'ascii' when no charset is given.  If neither charset nor language
    is given, the percent-encoded string is returned on its own.
    Otherwise the result is charset'language'encoded, with the empty string
    standing in for a missing language.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    lang = '' if language is None else language
    return "%s'%s'%s" % (charset, lang, quoted)
280
281
# Matches RFC 2231 style parameter names: name*, name*N and name*N*.  The
# base name and the optional segment number are exposed as named groups.
rfc2231_continuation = re.compile(
    r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000284
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    The first pair is passed through untouched.  Every later plain
    parameter is re-quoted, and continuation/extended parameters
    (name*, name*N, name*N*) are merged into a single entry per name.  An
    entry with at least one %-encoded segment gets a
    (charset, language, value) 3-tuple as its value.
    """
    # Work on a copy so the caller's list is not modified.
    pending = params[:]
    name, value = pending.pop(0)
    decoded = [(name, value)]
    # Maps a parameter name to its list of continuations; each item is a
    # 3-tuple of (continuation number, string value, %-encoded flag).
    segments_by_name = {}
    for name, value in pending:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo is None:
            decoded.append((name, '"%s"' % quote(value)))
            continue
        name, num = mo.group('name', 'num')
        if num is not None:
            num = int(num)
        segments_by_name.setdefault(name, []).append((num, value, encoded))
    for name, continuations in segments_by_name.items():
        # Put the segments in numerical order before joining them.
        continuations.sort()
        extended = False
        pieces = []
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values;
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            pieces.append(s)
        value = quote(EMPTYSTRING.join(pieces))
        if extended:
            # If any segment name ended in '*', the joined string must start
            # with the charset and language specifiers.
            charset, language, value = decode_rfc2231(value)
            decoded.append((name, (charset, language, '"%s"' % value)))
        else:
            decoded.append((name, '"%s"' % value))
    return decoded
340
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a decoded RFC 2231 parameter value into a plain string.

    value is either a string, which is simply unquoted, or a 3-tuple of
    (charset, language, text).  For a 3-tuple, text is reinterpreted as raw
    bytes and decoded with charset, using errors as the error handler.  The
    language element is ignored.

    fallback_charset is used when the charset element is None, i.e. the
    value carried no charset information.  If the charset is not a known
    codec, the unquoted text is returned as is.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    charset, language, text = value
    if charset is None:
        # No charset could be extracted from the value; str() would raise
        # TypeError for a None encoding, so fall back to the default.
        charset = fallback_charset
    # text arrives as a str whose characters stand for byte values; turn it
    # into those bytes directly rather than through a utf-8 encode.
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
354 return unquote(text)