blob: 5f40bac174bb94fce26025afcc5377b67ba9c8be [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Public names of this module, i.e. what ``from email.utils import *`` binds.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
    ]
22
23import os
24import re
25import time
26import base64
27import random
28import socket
Jeremy Hylton1afc1692008-06-18 20:49:58 +000029import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000030import warnings
31from io import StringIO
32
33from email._parseaddr import quote
34from email._parseaddr import AddressList as _AddressList
35from email._parseaddr import mktime_tz
36
# We need workarounds for bugs in these methods in older Pythons (see below)
38from email._parseaddr import parsedate as _parsedate
39from email._parseaddr import parsedate_tz as _parsedate_tz
40
41from quopri import decodestring as _qdecode
42
43# Intrapackage imports
44from email.encoders import _bencode, _qencode
45
# Small string constants shared by the helpers below.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
# Separator between the charset, language and text of an RFC 2231 value.
TICK = "'"

# Characters whose presence makes formataddr() wrap the name in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() prefixes with a backslash.
escapesre = re.compile(r'[][\\()"]')
54
55
Antoine Pitroufd036452008-08-19 17:56:33 +000056
Guido van Rossum8b3febe2007-08-30 01:15:14 +000057# Helpers
58
def formataddr(pair):
    """Build an RFC 2822 address string from (realname, email_address).

    This is the inverse of parseaddr(); the result is suitable for a From,
    To or Cc header.

    If the realname element of pair is false, the email_address element is
    returned unmodified.
    """
    realname, addr_spec = pair
    if not realname:
        return addr_spec
    # A name containing any of the specials is emitted between double quotes.
    wrapper = '"' if specialsre.search(realname) else ''
    # Put a backslash in front of every character matched by escapesre.
    escaped = escapesre.sub(r'\\\g<0>', realname)
    return '%s%s%s <%s>' % (wrapper, escaped, wrapper, addr_spec)
75
76
Antoine Pitroufd036452008-08-19 17:56:33 +000077
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    The field values are joined with ', ' and parsed as one address list.
    """
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
83
84
Antoine Pitroufd036452008-08-19 17:56:33 +000085
# Pattern for text of the form =?charset?encoding?atom?= where encoding is a
# single "q" or "b" in either case.  The three parts are exposed as the named
# groups 'charset', 'encoding' and 'atom'.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
95
96
Antoine Pitroufd036452008-08-19 17:56:33 +000097
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Return a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Local import: nothing else in this module needs calendar.
    import calendar

    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    if localtime:
        now = time.localtime(timeval)
        # Derive the offset from the broken-down times themselves rather than
        # from time.timezone/time.altzone: those module constants only
        # describe the zone's current rules, not the offset that applied at
        # timeval.  Reading both tuples as if they were UTC and subtracting
        # yields the offset in seconds east of UTC.
        offset = calendar.timegm(now) - calendar.timegm(time.gmtime(timeval))
        sign = '-' if offset < 0 else '+'
        hours, seconds = divmod(abs(offset), 3600)
        zone = '%s%02d%02d' % (sign, hours, seconds // 60)
    else:
        now = time.gmtime(timeval)
        # Timezone offset is always -0000
        zone = 'GMT' if usegmt else '-0000'
    weekday = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]]
    month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        weekday, now[2], month, now[0], now[3], now[4], now[5], zone)
148
149
Antoine Pitroufd036452008-08-19 17:56:33 +0000150
def make_msgid(idstring=None, domain=None):
    """Return a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.

    Optional domain if given provides the portion of the message id after
    the '@'.  It defaults to the result of socket.getfqdn(); passing it
    explicitly avoids that lookup and gives the same host part on every
    machine.
    """
    timeval = time.time()
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
    pid = os.getpid()
    randint = random.randrange(100000)
    # The optional idstring becomes one more dot-separated component.
    idstring = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain)
170
171
Antoine Pitroufd036452008-08-19 17:56:33 +0000172
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Return _parsedate(data), or None when data is empty or otherwise false."""
    return _parsedate(data) if data else None
180
181
def parsedate_tz(data):
    """Return _parsedate_tz(data), or None when data is empty or otherwise false."""
    return _parsedate_tz(data) if data else None
186
187
def parseaddr(addr):
    """Parse addr and return its first address as a 2-tuple.

    The tuple is the first entry of the parsed address list; ('', '') is
    returned when the list comes back empty.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
193
194
195# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes loses them and has the escapes \\\\ and
    \\" turned back into a backslash and a double quote.  A string wrapped in
    angle brackets just loses the brackets.  Anything else, including strings
    shorter than two characters, is returned unchanged.
    """
    if len(str) > 1:
        if str.startswith('"') and str.endswith('"'):
            # Decode both escapes in one left-to-right pass.  Two chained
            # replace() calls would treat the backslash produced by an
            # escaped backslash as the start of a new escape when a double
            # quote happens to follow it.
            return re.sub(r'\\([\\"])', r'\1', str[1:-1])
        if str.startswith('<') and str.endswith('>'):
            return str[1:-1]
    return str
204
205
Antoine Pitroufd036452008-08-19 17:56:33 +0000206
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000207# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on its first two single quotes.  When both are present the
    three resulting pieces are returned as a list; otherwise the result is
    the tuple (None, None, s).
    """
    pieces = s.split(TICK, 2)
    if len(pieces) > 2:
        return pieces
    return None, None, s
214
215
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded, using charset (or 'ascii' when no charset
    is given) to turn it into octets.

    If neither charset nor language is given, only the percent-encoded
    string is returned.  Otherwise the result has the form
    charset'language'encoded-string, with the empty string standing in for
    whichever of charset and language was not given.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    # A missing part is left empty between the single quotes; formatting
    # None directly would put the literal text "None" into the result.
    if charset is None:
        charset = ''
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, quoted)
229
230
# Matches parameter names of the form NAME*, NAME*N and NAME*N*, where N is a
# run of ASCII digits.  Group 'name' is NAME and group 'num' is N (None when
# no number is present).
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000233
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.

    Returns a new list of 2-tuples.  The first item of params is carried
    over as-is.  Every other ordinary parameter has its value unquoted and
    then re-quoted between double quotes.  Parameters whose names match
    rfc2231_continuation are grouped by base name and their segments are
    concatenated in sorted order; if any segment name ended in '*', the
    value of the resulting parameter is a (charset, language, value)
    3-tuple, otherwise it is a plain double-quoted string.  An empty params
    yields an empty list.
    """
    if not params:
        return []
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterating over a slice leaves the caller's sequence alone, works for
    # tuples as well as lists, and avoids repeated pop(0) calls.
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number
        continuations.sort()
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
289
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Turn a parameter value into a single string.

    value is either a plain string or a (charset, language, text) 3-tuple.
    Anything that is not a 3-tuple is simply passed through unquote().

    For a 3-tuple, text is converted back to the octets its characters
    stand for and decoded with charset, using errors as the codec error
    handler.  When charset is None, fallback_charset is used instead.  If
    the charset is not a known codec, unquote(text) is returned.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    charset, language, text = value
    if charset is None:
        # decode_rfc2231() hands back None for the charset when it finds no
        # charset'language' prefix; str() would reject None as an encoding.
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)