blob: c976021e0e0584c6ee56d5aa32b0034c531349c5 [file] [log] [blame]
Georg Brandl8cdc9bc2010-01-01 13:07:05 +00001# Copyright (C) 2001-2010 Python Software Foundation
Barry Warsawbb113862004-10-03 03:16:19 +00002# Author: Barry Warsaw
3# Contact: email-sig@python.org
Barry Warsawba925802001-09-23 03:17:28 +00004
Barry Warsaw24f79762004-05-09 03:55:11 +00005"""Miscellaneous utilities."""
Barry Warsawba925802001-09-23 03:17:28 +00006
# Names exported by "from <this module> import *".
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'mktime_tz',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
    ]
22
Barry Warsaw409a4c02002-04-10 21:01:31 +000023import os
Barry Warsaw24f79762004-05-09 03:55:11 +000024import re
25import time
26import base64
27import random
28import socket
Barry Warsawb110bad2006-07-21 14:51:07 +000029import urllib
Barry Warsaw409a4c02002-04-10 21:01:31 +000030import warnings
Barry Warsawba925802001-09-23 03:17:28 +000031
Barry Warsaw030ddf72002-11-05 19:54:52 +000032from email._parseaddr import quote
33from email._parseaddr import AddressList as _AddressList
34from email._parseaddr import mktime_tz
Barry Warsaw409a4c02002-04-10 21:01:31 +000035
# We need workarounds for bugs in these methods in older Pythons (see below)
Barry Warsaw030ddf72002-11-05 19:54:52 +000037from email._parseaddr import parsedate as _parsedate
38from email._parseaddr import parsedate_tz as _parsedate_tz
Barry Warsawba925802001-09-23 03:17:28 +000039
Barry Warsaw24f79762004-05-09 03:55:11 +000040from quopri import decodestring as _qdecode
Barry Warsawba925802001-09-23 03:17:28 +000041
42# Intrapackage imports
Barry Warsaw40ef0062006-03-18 15:41:53 +000043from email.encoders import _bencode, _qencode
Barry Warsawba925802001-09-23 03:17:28 +000044
# Separator getaddresses() uses to join several header values into one string.
COMMASPACE = ', '
# Joiner decode_params() uses to concatenate RFC 2231 continuation segments.
EMPTYSTRING = ''
# Unicode counterpart of EMPTYSTRING.
UEMPTYSTRING = u''
# Line ending that fix_eols() normalizes to.
CRLF = '\r\n'
# Single quote; decode_rfc2231() splits charset'language'value on it.
TICK = "'"

# Characters whose presence in a real name makes formataddr() wrap the name
# in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() prefixes with a backslash inside a real name.
escapesre = re.compile(r'[][\\()"]')
Barry Warsawba925802001-09-23 03:17:28 +000053
54
Barry Warsawe968ead2001-10-04 17:05:11 +000055
Barry Warsawba925802001-09-23 03:17:28 +000056# Helpers
57
58def _identity(s):
59 return s
60
61
62def _bdecode(s):
R. David Murray4617e502010-03-08 02:04:06 +000063 """Decodes a base64 string.
64
65 This function is equivalent to base64.decodestring and it's retained only
Ezio Melotti003014b2012-09-21 16:27:45 +030066 for backward compatibility. It used to remove the last \\n of the decoded
R. David Murray4617e502010-03-08 02:04:06 +000067 string, if it had any (see issue 7143).
68 """
Barry Warsawba925802001-09-23 03:17:28 +000069 if not s:
70 return s
R. David Murray4617e502010-03-08 02:04:06 +000071 return base64.decodestring(s)
Barry Warsawba925802001-09-23 03:17:28 +000072
73
Barry Warsawe968ead2001-10-04 17:05:11 +000074
def fix_eols(s):
    """Return s with every line ending normalized to \\r\\n.

    Two passes are made: first each \\n that is not preceded by \\r is
    replaced, then each \\r that is not followed by \\n.
    """
    without_bare_lf = re.sub(r'(?<!\r)\n', CRLF, s)
    return re.sub(r'\r(?!\n)', CRLF, without_bare_lf)
82
83
84
def formataddr(pair):
    """Build a header-ready address string from (realname, email_address).

    This is the inverse of parseaddr(): the result is suitable for an
    RFC 2822 From, To or Cc header.

    When the real name is false, the email address is returned by itself.
    Otherwise the characters matched by escapesre are backslash-escaped in
    the name, and the name is wrapped in double quotes if it contains any
    character matched by specialsre.
    """
    name, address = pair
    if not name:
        return address
    # The quoting decision is made on the name as given, before escaping.
    quotes = '"' if specialsre.search(name) else ''
    escaped = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, escaped, quotes, address)
101
Barry Warsaw409a4c02002-04-10 21:01:31 +0000102
103
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) pairs parsed from fieldvalues.

    The field values are first joined with ', ' into a single string, and
    that string is then parsed as one address list.
    """
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
Barry Warsawba925802001-09-23 03:17:28 +0000109
110
Barry Warsawe968ead2001-10-04 17:05:11 +0000111
# Matches text of the form =?charset?encoding?atom?= and exposes the three
# parts as the named groups 'charset', 'encoding' and 'atom'.  The encoding
# is a single "q" or "b"; the whole pattern is case insensitive.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
121
122
Barry Warsawaa79f4d2001-11-09 16:59:56 +0000123
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Return a date string in the RFC 2822 format, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, when given, is a floating point time value as accepted by
    gmtime() and localtime(); when omitted the current time is used.

    localtime is a flag: when true, timeval is converted with
    time.localtime() and the zone field carries the local offset (using
    time.altzone when daylight savings time is in effect, time.timezone
    otherwise) instead of UTC.

    usegmt is only consulted when localtime is false; it makes the zone
    field the ascii string "GMT" rather than the numeric "-0000".  This is
    the form needed for HTTP.
    """
    # strftime() cannot be used here: it honors the locale, whereas RFC 2822
    # requires the English day and month abbreviations.
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    if timeval is None:
        timeval = time.time()
    if not localtime:
        stamp = time.gmtime(timeval)
        zone = 'GMT' if usegmt else '-0000'
    else:
        stamp = time.localtime(timeval)
        # Pick the offset according to whether the local zone has daylight
        # savings time at all and whether it applies to this timestamp.
        if time.daylight and stamp[-1]:
            offset = time.altzone
        else:
            offset = time.timezone
        hours, leftover_seconds = divmod(abs(offset), 3600)
        # offset counts seconds west of UTC while the printed zone is
        # expressed east of UTC, hence the inverted sign.
        sign = '-' if offset > 0 else '+'
        zone = '%s%02d%02d' % (sign, hours, leftover_seconds // 60)
    year, month, day, hour, minute, second, weekday = stamp[:7]
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_names[weekday],
        day,
        month_names[month - 1],
        year, hour, minute, second,
        zone)
Barry Warsaw409a4c02002-04-10 21:01:31 +0000174
175
176
def make_msgid(idstring=None):
    """Return a string suitable for an RFC 2822 compliant Message-ID, e.g.:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    The id is assembled from the current UTC timestamp, the process id, a
    random integer below 100000 and the host name reported by
    socket.getfqdn().  If idstring is given, it is appended (after a dot) to
    the part before the '@' to strengthen the uniqueness of the message id.
    """
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    pid = os.getpid()
    randint = random.randrange(100000)
    suffix = '' if idstring is None else '.' + idstring
    idhost = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, suffix, idhost)
196
197
198
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse a date string with email._parseaddr.parsedate.

    A false value (empty string, None) yields None without calling the
    underlying parser.
    """
    return _parsedate(data) if data else None
206
207
def parsedate_tz(data):
    """Parse a date string with email._parseaddr.parsedate_tz.

    A false value (empty string, None) yields None without calling the
    underlying parser.
    """
    return _parsedate_tz(data) if data else None
212
213
def parseaddr(addr):
    """Return the first address parsed from addr.

    addr is parsed as an address list and its first entry is returned; when
    nothing could be parsed the result is the pair ('', '').
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
Barry Warsaw12566a82002-06-29 05:58:04 +0000219
220
Barry Warsaw184d55a2002-09-11 02:22:48 +0000221# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Strip one layer of surrounding quotes from a string.

    A string wrapped in double quotes loses them, and the backslash escapes
    \\\\ and \\" inside are undone.  A string wrapped in angle brackets loses
    just the brackets.  Anything else, including strings shorter than two
    characters, is returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        return inner.replace('\\\\', '\\').replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
230
231
Barry Warsaw12566a82002-06-29 05:58:04 +0000232
233# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Split an RFC 2231 value into its charset, language and text parts.

    s is split on at most the first two single quotes.  When both quotes are
    present the three resulting pieces are returned as a list; otherwise the
    result is the tuple (None, None, s).
    """
    pieces = s.split(TICK, 2)
    if len(pieces) > 2:
        return pieces
    return None, None, s
Barry Warsaw12566a82002-06-29 05:58:04 +0000240
241
242def encode_rfc2231(s, charset=None, language=None):
Barry Warsaw0ebc5c92002-10-01 00:44:13 +0000243 """Encode string according to RFC 2231.
244
245 If neither charset nor language is given, then s is returned as-is. If
246 charset is given but not language, the string is encoded using the empty
247 string for language.
248 """
Barry Warsaw12566a82002-06-29 05:58:04 +0000249 import urllib
250 s = urllib.quote(s, safe='')
251 if charset is None and language is None:
252 return s
Barry Warsaw0ebc5c92002-10-01 00:44:13 +0000253 if language is None:
254 language = ''
255 return "%s'%s'%s" % (charset, language, s)
Barry Warsaw12566a82002-06-29 05:58:04 +0000256
257
# Matches an RFC 2231 style parameter name: a word ('name'), a '*', then
# optionally a decimal continuation number ('num') which may itself be
# followed by another '*'.  'num' is None when no number is present.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
259
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is only read, never modified, so any indexable sequence (list or
    tuple) is accepted.  An empty sequence raises IndexError.

    Returns a new list of 2-tuples:

    * the first pair of params, copied through as-is;
    * for every other ordinary parameter, (name, '"value"') where the value
      has been unquoted and then re-quoted;
    * for every parameter split into RFC 2231 continuations, a single entry
      with the segments concatenated in numerical order.  If any segment
      name ended in '*', the entry is (name, (charset, language, '"value"'));
      otherwise it is (name, '"value"').
    """
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    # Iterate over a slice instead of repeatedly pop(0)-ing a copy: this
    # works for tuples as well as lists and avoids quadratic behavior.
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number
        continuations.sort()
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                s = urllib.unquote(s)
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
Barry Warsawbb113862004-10-03 03:16:19 +0000312
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Turn a parameter value into a single unquoted string.

    If value is not a tuple, it is simply unquoted and returned.

    If value is a tuple, its third item is unquoted and decoded to unicode
    using the charset in its first item ('us-ascii' when that item is
    false), with errors as the decoding error handler.  Should Python not
    know that charset (LookupError), fallback_charset is used instead.
    """
    if not isinstance(value, tuple):
        return unquote(value)
    rawval = unquote(value[2])
    charset = value[0] or 'us-ascii'
    try:
        return unicode(rawval, charset, errors)
    except LookupError:
        # XXX charset is unknown to Python.
        return unicode(rawval, fallback_charset, errors)
324 return unquote(value)