blob: ac4da3705f31781ff038af9b1b3bf351b208e3c4 [file] [log] [blame]
Benjamin Peterson46a99002010-01-09 18:45:30 +00001# Copyright (C) 2001-2010 Python Software Foundation
Guido van Rossum8b3febe2007-08-30 01:15:14 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Public names of this module (what "from email.utils import *" exports).
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
    ]
22
23import os
24import re
25import time
26import base64
27import random
28import socket
Jeremy Hylton1afc1692008-06-18 20:49:58 +000029import urllib.parse
Guido van Rossum8b3febe2007-08-30 01:15:14 +000030import warnings
31from io import StringIO
32
33from email._parseaddr import quote
34from email._parseaddr import AddressList as _AddressList
35from email._parseaddr import mktime_tz
36
# We need workarounds for bugs in these methods in older Pythons (see below)
38from email._parseaddr import parsedate as _parsedate
39from email._parseaddr import parsedate_tz as _parsedate_tz
40
41from quopri import decodestring as _qdecode
42
43# Intrapackage imports
44from email.encoders import _bencode, _qencode
45
# Small string constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# formataddr() wraps a real name in double quotes when it contains any
# character matched by specialsre, and backslash-escapes every character
# matched by escapesre.
specialsre = re.compile(r'[][\\()<>@,:;".]')
escapesre = re.compile(r'[][\\()"]')
54
55
Antoine Pitroufd036452008-08-19 17:56:33 +000056
Guido van Rossum8b3febe2007-08-30 01:15:14 +000057# Helpers
58
def formataddr(pair, charset='utf-8'):
    """The inverse of parseaddr(), this takes a 2-tuple of the form
    (realname, email_address) and returns the string value suitable
    for an RFC 2822 From, To or Cc header.

    If the first element of pair is false, then the second element is
    returned unmodified.

    Optional charset is the character set used to RFC 2047 encode realname
    when it contains non-ASCII characters.  It may be a string or an
    email.charset.Charset instance and defaults to utf-8.  A realname that
    is pure ASCII is never encoded; it is double-quoted when it contains
    RFC 2822 specials and has its special characters backslash-escaped.
    """
    name, address = pair
    if not name:
        return address
    try:
        name.encode('ascii')
    except UnicodeEncodeError:
        # A raw non-ASCII display name is not a legal header value, so emit
        # it as an encoded-word instead.  The import is local because
        # email.charset is only needed on this path.
        from email.charset import Charset
        if isinstance(charset, str):
            charset = Charset(charset)
        return '%s <%s>' % (charset.header_encode(name), address)
    quotes = ''
    if specialsre.search(name):
        quotes = '"'
    name = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, name, quotes, address)
75
76
Antoine Pitroufd036452008-08-19 17:56:33 +000077
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    The header values in fieldvalues are joined with ', ' and parsed as a
    single address list.
    """
    combined = COMMASPACE.join(fieldvalues)
    return _AddressList(combined).addresslist
83
84
Antoine Pitroufd036452008-08-19 17:56:33 +000085
# Matches one encoded-word of the form "=?charset?encoding?atom?=" and
# exposes the three parts as the named groups charset, encoding and atom.
# The encoding letter is matched case-insensitively (re.IGNORECASE).
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
95
96
Antoine Pitroufd036452008-08-19 17:56:33 +000097
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.

    Optional argument usegmt means that the timezone is written out as
    an ascii string, not numeric one (so "GMT" instead of "+0000"). This
    is needed for HTTP, and is only used when localtime==False.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    if localtime:
        now = time.localtime(timeval)
        # Prefer the UTC offset that applied at timeval itself (seconds east
        # of UTC).  time.timezone/time.altzone only describe the zone's
        # current rules, which can be wrong for other dates.
        gmtoff = getattr(now, 'tm_gmtoff', None)
        if gmtoff is None:
            # struct_time has no tm_gmtoff here: fall back to the module
            # constants, which are seconds *west* of UTC, hence the sign
            # flip.  tm_isdst may be -1 (unknown); only > 0 means DST.
            if time.daylight and now.tm_isdst > 0:
                gmtoff = -time.altzone
            else:
                gmtoff = -time.timezone
        hours, seconds = divmod(abs(gmtoff), 3600)
        sign = '-' if gmtoff < 0 else '+'
        zone = '%s%02d%02d' % (sign, hours, seconds // 60)
    else:
        now = time.gmtime(timeval)
        # Timezone offset is always -0000
        zone = 'GMT' if usegmt else '-0000'
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]],
        now[2],
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
        now[0], now[3], now[4], now[5],
        zone)
148
149
Antoine Pitroufd036452008-08-19 17:56:33 +0000150
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  Optional domain if given provides the
    portion of the message id after the '@'.  It defaults to the locally
    defined hostname.
    """
    timeval = time.time()
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
    pid = os.getpid()
    # The timestamp only has one-second resolution and the pid is constant
    # within a process, so the random component carries the uniqueness for
    # ids generated in quick succession.  64 random bits make a collision
    # practically impossible; the previous 100000 values did not.
    randint = random.getrandbits(64)
    idstring = '' if idstring is None else '.' + idstring
    if domain is None:
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain)
173
174
Antoine Pitroufd036452008-08-19 17:56:33 +0000175
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use them to work
# around broken older Pythons.
def parsedate(data):
    """Parse a date string with _parsedate(); return None for false input."""
    return _parsedate(data) if data else None
183
184
def parsedate_tz(data):
    """Parse a date string with _parsedate_tz(); return None for false input."""
    return _parsedate_tz(data) if data else None
189
190
def parseaddr(addr):
    """Return the first (realname, email_address) pair parsed from addr.

    When nothing can be parsed, the pair ('', '') is returned instead.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
196
197
198# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A value wrapped in double quotes loses them and has its backslash
    escapes for '\\' and '"' undone; a value wrapped in angle brackets just
    loses the brackets.  Anything else is returned unchanged.
    """
    # A single character cannot be a matching pair of delimiters.
    if len(str) <= 1:
        return str
    inner = str[1:-1]
    if str.startswith('"') and str.endswith('"'):
        unescaped = inner.replace('\\\\', '\\')
        return unescaped.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return inner
    return str
207
208
Antoine Pitroufd036452008-08-19 17:56:33 +0000209
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000210# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split on its first two single quotes into charset, language and
    text.  When s contains fewer than two single quotes, the result is
    (None, None, s).
    """
    pieces = s.split(TICK, 2)
    # With maxsplit=2 there are at most three pieces; exactly three means
    # both delimiters were present.
    if len(pieces) == 3:
        return pieces
    return None, None, s
217
218
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always percent-encoded (no character is treated as safe), using
    charset to turn it into octets, or ASCII when charset is not given.

    If neither charset nor language is given, then just the percent-encoded
    string is returned.  Otherwise the result has the form
    charset'language'encoded-text.  If charset is given but not language,
    the empty string is used for language.  If language is given but not
    charset, the charset is written as us-ascii, matching the codec that
    was used for the encoding.
    """
    quoted = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return quoted
    if charset is None:
        # Never emit the literal text "None" as the charset label.
        charset = 'us-ascii'
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, quoted)
232
233
# Matches a parameter name of the form "name*", "name*N" or "name*N*",
# where N is a decimal number.  The groups 'name' and 'num' capture the
# base name and N ('num' is None when no number is present).
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
    re.ASCII)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000236
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.

    Returns a new list of 2-tuples.  The first input pair is passed through
    unchanged.  Every other ordinary parameter is returned with its value
    unquoted, re-escaped and wrapped in double quotes.  Parameters whose
    name carries an RFC 2231 '*' marker are grouped by base name, joined in
    section order and appended after the ordinary ones; if any section was
    %-encoded, the value is a (charset, language, quoted_text) 3-tuple,
    otherwise it is a double-quoted string.  An empty params yields [].
    """
    if not params:
        return []
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by section number.  An un-numbered segment ("name*") has
        # num None, which cannot be compared with an int, so it is ordered
        # as -1; the remaining fields keep ties ordered as before.
        continuations.sort(
            key=lambda c: (-1 if c[0] is None else c[0],) + c[1:])
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for _num, s, encoded in continuations:
            if encoded:
                # Decode as "latin-1", so the characters in s directly
                # represent the percent-encoded octet values.
                # collapse_rfc2231_value treats this as an octet sequence.
                s = urllib.parse.unquote(s, encoding="latin-1")
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
292
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a parameter value into a plain string.

    value is either a string, which is simply unquoted, or a 3-tuple
    (charset, language, text) such as decode_params() produces for
    %-encoded parameters.  For a 3-tuple, text is reinterpreted as octets
    and decoded with charset, using errors as the codec error handler.

    fallback_charset is used when the tuple's charset is None, which is
    what decode_rfc2231() reports when it cannot find a charset.  If the
    charset is not a known codec, the unquoted text is returned undecoded.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    charset, language, text = value
    if charset is None:
        # str(rawbytes, None, errors) would raise TypeError.
        charset = fallback_charset
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)