blob: 6d22ca7e09eb8bafefc67a42b5cfb5c926fc6618 [file] [log] [blame]
Georg Brandl8cdc9bc2010-01-01 13:07:05 +00001# Copyright (C) 2001-2010 Python Software Foundation
Barry Warsawbb113862004-10-03 03:16:19 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
Barry Warsawba925802001-09-23 03:17:28 +00004
Barry Warsaw24f79762004-05-09 03:55:11 +00005"""Miscellaneous utilities."""
Barry Warsawba925802001-09-23 03:17:28 +00006
# Public names of this module; this is what ``from email.utils import *``
# binds in the importing namespace.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
]
22
Barry Warsaw409a4c02002-04-10 21:01:31 +000023import os
Barry Warsaw24f79762004-05-09 03:55:11 +000024import re
25import time
26import base64
27import random
28import socket
Barry Warsawb110bad2006-07-21 14:51:07 +000029import urllib
Barry Warsaw409a4c02002-04-10 21:01:31 +000030import warnings
Barry Warsawba925802001-09-23 03:17:28 +000031
Barry Warsaw030ddf72002-11-05 19:54:52 +000032from email._parseaddr import quote
33from email._parseaddr import AddressList as _AddressList
34from email._parseaddr import mktime_tz
Barry Warsaw409a4c02002-04-10 21:01:31 +000035
# We need workarounds for bugs in these methods in older Pythons (see below)
Barry Warsaw030ddf72002-11-05 19:54:52 +000037from email._parseaddr import parsedate as _parsedate
38from email._parseaddr import parsedate_tz as _parsedate_tz
Barry Warsawba925802001-09-23 03:17:28 +000039
Barry Warsaw24f79762004-05-09 03:55:11 +000040from quopri import decodestring as _qdecode
Barry Warsawba925802001-09-23 03:17:28 +000041
42# Intrapackage imports
Barry Warsaw40ef0062006-03-18 15:41:53 +000043from email.encoders import _bencode, _qencode
Barry Warsawba925802001-09-23 03:17:28 +000044
# String constants shared by the functions in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = u''
CRLF = '\r\n'
TICK = "'"

# formataddr() wraps a real name in double quotes when it contains any of
# these characters ...
specialsre = re.compile(r'[][\\()<>@,:;".]')
# ... and backslash-escapes every occurrence of these.
escapesre = re.compile(r'[][\\()"]')
Barry Warsawba925802001-09-23 03:17:28 +000053
54
Barry Warsawe968ead2001-10-04 17:05:11 +000055
Barry Warsawba925802001-09-23 03:17:28 +000056# Helpers
57
def _identity(s):
    """Return *s* unchanged."""
    return s
60
61
def _bdecode(s):
    r"""Decode the base64 string *s*.

    A false *s* (empty string, None) is handed back untouched; anything
    else is passed to base64.decodestring().  The function is retained only
    for backward compatibility: it used to strip the final \n of the
    decoded string, if there was one (see issue 7143).
    """
    return base64.decodestring(s) if s else s
Barry Warsawba925802001-09-23 03:17:28 +000072
73
Barry Warsawe968ead2001-10-04 17:05:11 +000074
def fix_eols(s):
    r"""Return *s* with every line ending normalized to \r\n."""
    # First pass: give each newline that lacks one a preceding carriage
    # return.
    with_cr = re.sub(r'(?<!\r)\n', CRLF, s)
    # Second pass: give each carriage return that lacks one a following
    # newline.
    return re.sub(r'\r(?!\n)', CRLF, with_cr)
82
83
84
def formataddr(pair):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the returned string is suitable for
    an RFC 2822 From, To or Cc header.

    If the first element of pair is false, then the second element is
    returned unmodified.
    """
    name, address = pair
    if not name:
        return address
    # The real name only needs surrounding quotes when it contains one of
    # the special characters.
    if specialsre.search(name):
        quotes = '"'
    else:
        quotes = ''
    escaped = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, escaped, quotes, address)
101
Barry Warsaw409a4c02002-04-10 21:01:31 +0000102
103
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue.

    The field values are joined with ', ' and the result is parsed as one
    address list.
    """
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
Barry Warsawba925802001-09-23 03:17:28 +0000109
110
Barry Warsawe968ead2001-10-04 17:05:11 +0000111
# Matches one encoded word of the form =?charset?encoding?atom?= and exposes
# the three fields as named groups.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
121
122
Barry Warsawaa79f4d2001-11-09 16:59:56 +0000123
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Return a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, if given, is a floating point time value as accepted by
    time.gmtime() and time.localtime(); otherwise the current time is used.

    localtime is a flag that, when true, interprets timeval and returns a
    date relative to the local timezone instead of UTC, taking daylight
    savings time into account.

    usegmt means that the timezone is written out as an ascii string
    rather than a numeric one ("GMT" instead of "-0000").  This is needed
    for HTTP, and only has an effect when localtime is false.
    """
    # strftime() is deliberately avoided: it honors the locale, whereas RFC
    # 2822 requires the English day and month abbreviations.
    day_names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    if timeval is None:
        timeval = time.time()
    if not localtime:
        now = time.gmtime(timeval)
        # For UTC the zone is fixed; only its spelling depends on usegmt.
        zone = 'GMT' if usegmt else '-0000'
    else:
        now = time.localtime(timeval)
        # Pick the DST offset only when the local zone has daylight savings
        # time and it is in effect for this time value.
        if time.daylight and now[-1]:
            offset = time.altzone
        else:
            offset = time.timezone
        hours, leftover_seconds = divmod(abs(offset), 3600)
        # offset counts seconds west of UTC while the zone is written as
        # hours and minutes east of UTC, so the signs are opposite.
        sign = '-' if offset > 0 else '+'
        zone = '%s%02d%02d' % (sign, hours, leftover_seconds // 60)
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_names[now[6]],
        now[2],
        month_names[now[1] - 1],
        now[0], now[3], now[4], now[5],
        zone)
Barry Warsaw409a4c02002-04-10 21:01:31 +0000174
175
176
def make_msgid(idstring=None):
    """Return a string suitable for an RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    The id is built from the current UTC date and time, the process id, a
    random number below 100000 and the fully qualified host name.

    idstring, if given, is a string appended to the part before the '@' to
    strengthen the uniqueness of the message id.
    """
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    pid = os.getpid()
    randint = random.randrange(100000)
    suffix = '' if idstring is None else '.' + idstring
    idhost = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, suffix, idhost)
196
197
198
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse the date string *data* with email._parseaddr.parsedate().

    A false *data* (empty string, None) yields None without calling the
    parser.
    """
    return _parsedate(data) if data else None
206
207
def parsedate_tz(data):
    """Parse the date string *data* with email._parseaddr.parsedate_tz().

    A false *data* (empty string, None) yields None without calling the
    parser.
    """
    return _parsedate_tz(data) if data else None
212
213
def parseaddr(addr):
    """Parse *addr* and return its first address.

    The result is the first entry of the address list parsed from addr, or
    the pair ('', '') when that list is empty.
    """
    parsed = _AddressList(addr).addresslist
    if parsed:
        return parsed[0]
    return '', ''
Barry Warsaw12566a82002-06-29 05:58:04 +0000219
220
Barry Warsaw184d55a2002-09-11 02:22:48 +0000221# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string wrapped in double quotes loses them and has the escapes for
    backslash and double quote undone; a string wrapped in angle brackets
    just loses the brackets.  Anything else, including strings shorter than
    two characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
230
231
Barry Warsaw12566a82002-06-29 05:58:04 +0000232
233# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is split at its first two single quotes.  When both are present the
    three pieces are returned as a list; otherwise the result is the tuple
    (None, None, s).
    """
    pieces = s.split(TICK, 2)
    if len(pieces) > 2:
        return pieces
    return None, None, s
Barry Warsaw12566a82002-06-29 05:58:04 +0000240
241
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is always %-encoded, with no character treated as safe.  If neither
    charset nor language is given, the %-encoded string is returned on its
    own.  Otherwise the result has the form charset'language'encoded-s,
    with the empty string standing in for whichever of charset and language
    was not given.
    """
    # Resolve the quoting function locally so that the block works with
    # either standard library layout.
    try:
        from urllib import quote as _urlquote
    except ImportError:
        from urllib.parse import quote as _urlquote
    s = _urlquote(s, safe='')
    if charset is None and language is None:
        return s
    # Either field may be left blank, but formatting None directly would put
    # the literal text "None" into the encoded value.
    if charset is None:
        charset = ''
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, s)
Barry Warsaw12566a82002-06-29 05:58:04 +0000256
257
# Matches parameter names that end in '*', optionally followed by a
# continuation number and a second '*': name*, name*N or name*N*.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')


def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.  The first item is copied to the result unchanged;
    an empty sequence yields an empty list.

    Returns a new list of 2-tuples.  An ordinary parameter comes back as
    (name, '"value"').  A parameter whose name matches the continuation
    pattern has its segments joined in numerical order; if any segment was
    %-encoded its value is the 3-tuple (charset, language, '"text"'),
    otherwise it is '"text"'.
    """
    new_params = []
    if not params:
        return new_params
    name, value = params[0]
    new_params.append((name, value))
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterate over a slice instead of popping from the front of a copy: the
    # caller's sequence stays untouched and each step is O(1).
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo is None:
            new_params.append((name, '"%s"' % quote(value)))
            continue
        name, num = mo.group('name', 'num')
        if num is not None:
            num = int(num)
        rfc2231_params.setdefault(name, []).append((num, value, encoded))
    for name, continuations in rfc2231_params.items():
        # Sort by number
        continuations.sort()
        # Append all values in numerical order, converting %-encodings for
        # the encoded segments.  If any of the continuation names ends in a
        # *, then the entire string, after decoding segments and
        # concatenating, must have the charset and language specifiers at
        # the beginning of the string.
        extended = False
        segments = []
        for num, s, encoded in continuations:
            if encoded:
                s = urllib.unquote(s)
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
Barry Warsawbb113862004-10-03 03:16:19 +0000312
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse an RFC 2231 parameter value into a single string.

    value is either a plain string or a tuple whose first item is the
    charset and whose third item is the text.  A plain string is returned
    with its quotes removed.  For a tuple, the unquoted text is decoded to
    unicode using the charset, with errors as the decoding error handler.

    fallback_charset is used when the tuple carries no charset, and again
    when the charset is not known to Python.
    """
    if not isinstance(value, tuple):
        return unquote(value)
    rawval = unquote(value[2])
    # Honour the caller's fallback when no charset was supplied, rather
    # than a hard-coded 'us-ascii' that merely duplicates the default.
    charset = value[0] or fallback_charset
    try:
        return unicode(rawval, charset, errors)
    except LookupError:
        # The charset is unknown to Python.
        return unicode(rawval, fallback_charset, errors)
324 return unquote(value)