blob: 82f7283c077e7fc413f0233acf6aec61baa06672 [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Public names exported by ``from email.utils import *``.  Kept sorted.
# ``quote`` is imported from email._parseaddr in this module and is the
# natural counterpart of ``unquote``, so it is exported as well.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'quote',
    'unquote',
    ]
22
23import os
24import re
25import time
26import base64
27import random
28import socket
Jeremy Hylton1afc1692008-06-18 20:49:58 +000029import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000030import warnings
31from io import StringIO
32
33from email._parseaddr import quote
34from email._parseaddr import AddressList as _AddressList
35from email._parseaddr import mktime_tz
36
# We need workarounds for bugs in these methods in older Pythons (see below)
38from email._parseaddr import parsedate as _parsedate
39from email._parseaddr import parsedate_tz as _parsedate_tz
40
41from quopri import decodestring as _qdecode
42
43# Intrapackage imports
44from email.encoders import _bencode, _qencode
R David Murray8debacb2011-04-06 09:35:57 -040045from email.charset import Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000046
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# Characters whose presence in a display name forces formataddr() to wrap
# the name in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that are backslash-escaped inside the display name.  Within a
# quoted string only the backslash and the double quote need a quoted-pair;
# escaping brackets and parentheses as well just leaks stray backslashes
# into the name shown to the recipient.
escapesre = re.compile(r'[\\"]')
55
56
Antoine Pitroufd036452008-08-19 17:56:33 +000057
Guido van Rossum8b3febe2007-08-30 01:15:14 +000058# Helpers
59
def formataddr(pair, charset='utf-8'):
    """The inverse of parseaddr(), this takes a 2-tuple of the form
    (realname, email_address) and returns the string value suitable
    for an RFC 2822 From, To or Cc header.

    If the first element of pair is false, then the second element is
    returned unmodified.

    Optional charset if given is the character set that is used to encode
    realname in case realname is not ASCII safe.  Can be an instance of str or
    a Charset-like object which has a header_encode method.  Default is
    'utf-8'.

    Raises UnicodeError if email_address contains non-ASCII characters.
    """
    name, address = pair
    # The address MUST (per RFC) be ascii, so throw a UnicodeError if it isn't.
    address.encode('ascii')
    if not name:
        return address
    try:
        name.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII display name: emit it as an encoded word rather than a
        # quoted string.
        if isinstance(charset, str):
            charset = Charset(charset)
        encoded_name = charset.header_encode(name)
        return "%s <%s>" % (encoded_name, address)
    # ASCII display name: quote it only when it contains a special
    # character; the backslash-escaping is applied in either case.
    quotes = '"' if specialsre.search(name) else ''
    name = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, name, quotes, address)
91
92
Antoine Pitroufd036452008-08-19 17:56:33 +000093
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    fieldvalues is an iterable of header field values.  Each value is
    converted with str() before parsing, so objects that merely render as
    a header value are accepted alongside plain strings.
    """
    # Join everything into one comma-separated address list and parse it in
    # a single pass.
    joined = COMMASPACE.join(str(value) for value in fieldvalues)
    return _AddressList(joined).addresslist
99
100
Antoine Pitroufd036452008-08-19 17:56:33 +0000101
# Matches one encoded word of the form =?charset?encoding?atom?= .  The
# pattern is compiled in VERBOSE mode, so the layout and the trailing
# comments inside the string are not part of what is matched.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
111
112
Antoine Pitroufd036452008-08-19 17:56:33 +0000113
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Imported here so the function does not depend on a module-level import
    # that this file does not have.
    import calendar

    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    if localtime:
        now = time.localtime(timeval)
        # Offset in seconds east of UTC that applies to *this* timestamp:
        # reading the local broken-down time as if it were UTC and
        # subtracting the real UTC value leaves exactly the zone offset.
        # time.timezone/time.altzone only describe the zone's current rules
        # and are wrong for dates governed by different rules.
        offset = calendar.timegm(now) - calendar.timegm(time.gmtime(timeval))
        sign = '-' if offset < 0 else '+'
        hours, remainder = divmod(abs(offset), 3600)
        zone = '%s%02d%02d' % (sign, hours, remainder // 60)
    else:
        now = time.gmtime(timeval)
        # Timezone offset is always -0000
        zone = 'GMT' if usegmt else '-0000'
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]],
        now[2],
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
        now[0], now[3], now[4], now[5],
        zone)
164
165
Antoine Pitroufd036452008-08-19 17:56:33 +0000166
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.9667112345678901234@nightshade.la.mastaler.com>

    The id is built from the current UTC time (to the second), the process
    id and a random number.

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  Optional domain if given provides the
    portion of the message id after the '@'.  It defaults to the locally
    defined hostname.
    """
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    pid = os.getpid()
    # The timestamp only has one-second resolution, so one process creating
    # many ids within a second relies entirely on this value; 64 random bits
    # make a clash negligible where a five-digit number did not.
    randint = random.getrandbits(64)
    suffix = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, suffix, domain)
189
190
Antoine Pitroufd036452008-08-19 17:56:33 +0000191
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse a date string via email._parseaddr.parsedate().

    Returns None when data is empty or None, without calling the underlying
    parser; otherwise returns whatever that parser returns.
    """
    if not data:
        return None
    return _parsedate(data)
199
200
def parsedate_tz(data):
    """Parse a date string via email._parseaddr.parsedate_tz().

    Returns None when data is empty or None, without calling the underlying
    parser; otherwise returns whatever that parser returns.
    """
    if not data:
        return None
    return _parsedate_tz(data)
205
206
def parseaddr(addr):
    """Parse addr and return the first address found as a 2-tuple.

    The result is the first entry of the parsed address list, or
    ('', '') when the list is empty.
    """
    addrs = _AddressList(addr).addresslist
    if not addrs:
        return '', ''
    return addrs[0]
212
213
214# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes loses them and has the escape
    sequences \\\\ and \\" collapsed to a single backslash and a double
    quote.  A string wrapped in angle brackets just loses the brackets.
    Anything else, including strings shorter than two characters, is
    returned unchanged.
    """
    # NOTE: the parameter name shadows the builtin, but it is part of the
    # public signature and therefore kept.
    if len(str) > 1:
        if str.startswith('"') and str.endswith('"'):
            return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if str.startswith('<') and str.endswith('>'):
            return str[1:-1]
    return str
223
224
Antoine Pitroufd036452008-08-19 17:56:33 +0000225
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000226# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    Splits s on its first two single quotes into charset, language and
    text.  If s contains fewer than two single quotes the tuple
    (None, None, s) is returned; otherwise the result is the three-item
    list [charset, language, text].  The text is not percent-decoded here.
    """
    parts = s.split(TICK, 2)
    if len(parts) <= 2:
        return None, None, s
    # Returned as the list produced by split(); callers unpack it.
    return parts
233
234
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded, using charset (or 'ascii' when charset is
    not given) to turn it into bytes first.  If neither charset nor language
    is given, the percent-encoded string is returned on its own.  Otherwise
    the result has the form charset'language'encoded-text, with the empty
    string standing in for whichever of charset or language was not given.
    """
    s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return s
    # A missing charset must be written as an empty field; formatting None
    # directly would put the literal text "None" in the charset slot.
    if charset is None:
        charset = ''
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, s)
248
249
# Matches RFC 2231 extended parameter names: NAME*, NAME*N and NAME*N*.
# Group 'name' is the bare parameter name; group 'num' is the segment
# number, or None for the unnumbered NAME* form.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000252
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).

    Returns a new list; params itself is not modified.  The first item is
    copied through untouched.  Ordinary parameters come back as
    (name, '"value"').  Parameters written with the RFC 2231 NAME*, NAME*N
    or NAME*N* syntax are reassembled and appended after the ordinary ones:
    as (name, '"value"') when no segment was %-encoded, or as
    (name, (charset, language, '"value"')) when at least one was.  An empty
    params yields an empty list.
    """
    if not params:
        return []
    new_params = [params[0]]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number.  An unnumbered segment (num is None) cannot be
        # compared with an int, so it is ranked ahead of every numbered one;
        # otherwise the ordering is that of the plain tuples.
        continuations.sort(
            key=lambda c: (c[0] is not None, c[0] or 0) + c[1:])
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
308
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Turn a decoded RFC 2231 parameter value into a plain string.

    value is either a string, which is simply unquoted, or a 3-tuple of
    (charset, language, text).  For a tuple, text is converted back to the
    octets it stands for and decoded with charset, using errors as the
    codec error handler.  fallback_charset is used when charset is None.
    If the charset is not a known codec, the unquoted text is returned
    without decoding.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    charset, language, text = value
    if charset is None:
        # decode_rfc2231() reports None when it could not find the charset
        # and language fields; str() would raise TypeError on that, so fall
        # back to the caller-supplied default.
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)
322 return unquote(text)