blob: 3a4bbc83b63f9e56fa2885710c034b8f03392da1 [file] [log] [blame]
Barry Warsaw24f79762004-05-09 03:55:11 +00001# Copyright (C) 2001-2004 Python Software Foundation
2# Author: barry@python.org (Barry Warsaw)
Barry Warsawba925802001-09-23 03:17:28 +00003
Barry Warsaw24f79762004-05-09 03:55:11 +00004"""Miscellaneous utilities."""
Barry Warsawba925802001-09-23 03:17:28 +00005
Barry Warsaw409a4c02002-04-10 21:01:31 +00006import os
Barry Warsaw24f79762004-05-09 03:55:11 +00007import re
8import time
9import base64
10import random
11import socket
Barry Warsaw409a4c02002-04-10 21:01:31 +000012import warnings
13from cStringIO import StringIO
Barry Warsawba925802001-09-23 03:17:28 +000014
Barry Warsaw030ddf72002-11-05 19:54:52 +000015from email._parseaddr import quote
16from email._parseaddr import AddressList as _AddressList
17from email._parseaddr import mktime_tz
Barry Warsaw409a4c02002-04-10 21:01:31 +000018
# We need workarounds for bugs in these methods in older Pythons (see below)
Barry Warsaw030ddf72002-11-05 19:54:52 +000020from email._parseaddr import parsedate as _parsedate
21from email._parseaddr import parsedate_tz as _parsedate_tz
Barry Warsawba925802001-09-23 03:17:28 +000022
Barry Warsaw24f79762004-05-09 03:55:11 +000023from quopri import decodestring as _qdecode
Barry Warsawba925802001-09-23 03:17:28 +000024
25# Intrapackage imports
Barry Warsaw21f77ac2002-06-02 19:07:16 +000026from email.Encoders import _bencode, _qencode
Barry Warsawba925802001-09-23 03:17:28 +000027
# String constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = u''
CRLF = '\r\n'

# Characters whose presence in a real name makes formataddr() wrap the name
# in double quotes.
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() backslash-escapes inside a real name.
escapesre = re.compile(r'[][\\()"]')
Barry Warsawba925802001-09-23 03:17:28 +000035
36
Barry Warsawe968ead2001-10-04 17:05:11 +000037
Barry Warsawba925802001-09-23 03:17:28 +000038# Helpers
39
40def _identity(s):
41 return s
42
43
44def _bdecode(s):
Barry Warsawba925802001-09-23 03:17:28 +000045 # We can't quite use base64.encodestring() since it tacks on a "courtesy
46 # newline". Blech!
47 if not s:
48 return s
Barry Warsawba925802001-09-23 03:17:28 +000049 value = base64.decodestring(s)
Barry Warsaw5bdb2be2002-09-28 20:49:57 +000050 if not s.endswith('\n') and value.endswith('\n'):
Barry Warsawba925802001-09-23 03:17:28 +000051 return value[:-1]
52 return value
53
54
Barry Warsawe968ead2001-10-04 17:05:11 +000055
def fix_eols(s):
    r"""Replace all line-ending characters with \r\n.

    A bare LF and a bare CR are each rewritten as CRLF; CR LF pairs that are
    already present are left as they are.  Returns the new string.
    """
    # Fix newlines with no preceding carriage return
    s = re.sub(r'(?<!\r)\n', CRLF, s)
    # Fix carriage returns with no following newline
    return re.sub(r'\r(?!\n)', CRLF, s)
63
64
65
def formataddr(pair):
    """The inverse of parseaddr(), this takes a 2-tuple of the form
    (realname, email_address) and returns the string value suitable
    for an RFC 2822 From, To or Cc header.

    If the first element of pair is false, then the second element is
    returned unmodified.
    """
    name, address = pair
    if not name:
        return address
    # A name containing any character matched by specialsre is wrapped in
    # double quotes; other names are emitted bare.
    if specialsre.search(name):
        quotes = '"'
    else:
        quotes = ''
    # Put a backslash in front of every character matched by escapesre.
    name = escapesre.sub(r'\\\g<0>', name)
    return '%s%s%s <%s>' % (quotes, name, quotes, address)
82
# For backwards compatibility
def dump_address_pair(pair):
    """Deprecated alias for formataddr().

    Issues a DeprecationWarning (stacklevel 2, so it is reported against the
    caller) and then returns formataddr(pair).
    """
    warnings.warn('Use email.Utils.formataddr() instead',
                  DeprecationWarning, 2)
    return formataddr(pair)
88
89
90
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) for each fieldvalue."""
    # Join all header values into one comma-separated string so a single
    # AddressList parse covers every address.  (The local used to be called
    # `all`, which shadowed the builtin.)
    joined = COMMASPACE.join(fieldvalues)
    return _AddressList(joined).addresslist
Barry Warsawba925802001-09-23 03:17:28 +000096
97
Barry Warsawe968ead2001-10-04 17:05:11 +000098
# Matches a single RFC 2047 encoded-word of the form =?charset?encoding?atom?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
108
109
def decode(s):
    """Return a decoded string according to RFC 2047, as a unicode string.

    NOTE: This function is deprecated.  Use Header.decode_header() instead.

    A DeprecationWarning is issued on every call.  If decode_header() does
    not hand back a list, s is returned unchanged.
    """
    warnings.warn('Use Header.decode_header() instead.', DeprecationWarning, 2)
    # Intra-package import here to avoid circular import problems.
    from email.Header import decode_header
    decoded = decode_header(s)
    if not isinstance(decoded, list):
        # s wasn't decoded
        return s

    pieces = []
    for atom, charset in decoded:
        if charset is None:
            # No charset recorded for this chunk: keep it as it is.
            pieces.append(atom)
        else:
            # Convert the string to Unicode using the given encoding.  Leave
            # Unicode conversion errors to strict.
            pieces.append(unicode(atom, charset))
    # Now that we've decoded everything, we just need to join all the parts
    # together into the final string.
    return UEMPTYSTRING.join(pieces)
134
135
Barry Warsawe968ead2001-10-04 17:05:11 +0000136
def encode(s, charset='iso-8859-1', encoding='q'):
    """Encode a string according to RFC 2047.

    NOTE: This function is deprecated; a DeprecationWarning is issued on
    every call.  Use Header.Header.encode() instead.

    encoding is matched case-insensitively: 'q' runs s through _qencode(),
    'b' through _bencode().  Any other value raises ValueError.  The result
    has the form =?charset?encoding?text?= with charset and encoding
    lower-cased.
    """
    warnings.warn('Use Header.Header.encode() instead.', DeprecationWarning, 2)
    encoding = encoding.lower()
    if encoding == 'q':
        estr = _qencode(s)
    elif encoding == 'b':
        estr = _bencode(s)
    else:
        # Call form of raise: the old "raise X, msg" statement is a syntax
        # error on newer interpreters.
        raise ValueError('Illegal encoding code: ' + encoding)
    return '=?%s?%s?%s?=' % (charset.lower(), encoding, estr)
Barry Warsawaa79f4d2001-11-09 16:59:56 +0000148
149
150
def formatdate(timeval=None, localtime=False):
    """Returns a date string as specified by RFC 2822, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    Optional timeval if given is a floating point time value as accepted by
    gmtime() and localtime(), otherwise the current time is used.

    Optional localtime is a flag that when True, interprets timeval, and
    returns a date relative to the local timezone instead of UTC, properly
    taking daylight savings time into account.
    """
    # Note: we cannot use strftime() because that honors the locale and RFC
    # 2822 requires that day and month names be the English abbreviations.
    if timeval is None:
        timeval = time.time()
    if localtime:
        now = time.localtime(timeval)
        # Calculate timezone offset, based on whether the local zone has
        # daylight savings time, and whether DST is in effect (the last
        # field of the time tuple is the DST flag).
        if time.daylight and now[-1]:
            offset = time.altzone
        else:
            offset = time.timezone
        # Split the offset into whole hours plus the leftover seconds.
        hours, seconds = divmod(abs(offset), 3600)
        # Remember offset is in seconds west of UTC, but the zone field is
        # written as hours and minutes east of UTC, so the signs differ.
        if offset > 0:
            sign = '-'
        else:
            sign = '+'
        # Floor division keeps the minutes an integer under every division
        # mode.
        zone = '%s%02d%02d' % (sign, hours, seconds // 60)
    else:
        now = time.gmtime(timeval)
        # Timezone offset is always -0000
        zone = '-0000'
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'][now[6]],
        now[2],
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'][now[1] - 1],
        now[0], now[3], now[4], now[5],
        zone)
Barry Warsaw409a4c02002-04-10 21:01:31 +0000194
195
196
def make_msgid(idstring=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.

    The id is built from the current UTC time (YYYYMMDDHHMMSS), the process
    id, a random integer below 100000, the optional idstring and the value
    of socket.getfqdn().
    """
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(time.time()))
    pid = os.getpid()
    randint = random.randrange(100000)
    # The extra id string, when supplied, becomes one more dot-separated
    # field in front of the "@".
    if idstring is None:
        suffix = ''
    else:
        suffix = '.' + idstring
    idhost = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, suffix, idhost)
216
217
218
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions.  We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse the date string data with email._parseaddr.parsedate().

    Returns None without calling the parser when data is false (None or the
    empty string); otherwise returns whatever the parser returns.
    """
    if data:
        return _parsedate(data)
    return None
226
227
def parsedate_tz(data):
    """Parse the date string data with email._parseaddr.parsedate_tz().

    Returns None without calling the parser when data is false (None or the
    empty string); otherwise returns whatever the parser returns.
    """
    if data:
        return _parsedate_tz(data)
    return None
232
233
def parseaddr(addr):
    """Parse addr and return the first address found in it.

    The result is the first entry of the AddressList parse of addr, or the
    pair ('', '') when the parse yields no addresses.
    """
    addrs = _AddressList(addr).addresslist
    if addrs:
        return addrs[0]
    return '', ''
Barry Warsaw12566a82002-06-29 05:58:04 +0000239
240
# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Remove quotes from a string.

    A string of two or more characters that starts and ends with a double
    quote loses those quotes and has its backslash escapes (\\\\ and \\")
    undone.  One that starts with '<' and ends with '>' just loses the angle
    brackets.  Anything else is returned unchanged.
    """
    if len(str) > 1:
        inner = str[1:-1]
        if str.startswith('"') and str.endswith('"'):
            # Undo escaped backslashes first, then escaped double quotes.
            return inner.replace('\\\\', '\\').replace('\\"', '"')
        if str.startswith('<') and str.endswith('>'):
            return inner
    return str
250
251
Barry Warsaw12566a82002-06-29 05:58:04 +0000252
# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Decode string according to RFC 2231.

    s is expected to look like charset'language'value.  Returns the 3-tuple
    (charset, language, value) with value percent-decoded.  If s does not
    contain both apostrophes it is treated as a bare value: charset and
    language are None and the whole of s is percent-decoded.
    """
    # unquote() lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import unquote as url_unquote
    except ImportError:
        from urllib.parse import unquote as url_unquote
    parts = s.split("'", 2)
    # Fewer than three fields means there is no charset'language' prefix.
    # A single stray apostrophe used to blow up on the unpacking below.
    if len(parts) < 3:
        return None, None, url_unquote(s)
    charset, language, value = parts
    return charset, language, url_unquote(value)
Barry Warsaw12566a82002-06-29 05:58:04 +0000262
263
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    If neither charset nor language is given, then s is returned as-is
    (after percent-quoting).  If only one of charset and language is given,
    the empty string is used for the missing one.  Otherwise the result has
    the form charset'language'quoted-s.
    """
    # quote() lives in urllib on Python 2 and in urllib.parse on Python 3.
    try:
        from urllib import quote as url_quote
    except ImportError:
        from urllib.parse import quote as url_quote
    s = url_quote(s, safe='')
    if charset is None and language is None:
        return s
    # RFC 2231 allows either field to be empty; never let a missing value
    # leak into the output as the literal text "None".
    if charset is None:
        charset = ''
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, s)
Barry Warsaw12566a82002-06-29 05:58:04 +0000278
279
# Matches an RFC 2231 extended parameter name: NAME*, NAME*N or NAME*N*.
# Group "name" is the base parameter name, group "num" the section number
# (None when the name carries no number).
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
281
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (content type, string value).

    The first item is copied through unchanged.  Every later value is
    unquoted; items whose name is not an RFC 2231 extended name are
    re-quoted and kept in order.  The pieces of each extended parameter are
    sorted by section number, joined and passed to decode_rfc2231(), and
    the parameter is appended as (name, (charset, language, quoted_value)).
    An empty params sequence yields an empty list.
    """
    # Nothing to decode; indexing params[0] below would raise IndexError.
    if not params:
        return []
    new_params = []
    # maps parameter's name to a list of continuations
    rfc2231_params = {}
    # params is a sequence of 2-tuples containing (content_type, string value)
    name, value = params[0]
    new_params.append((name, value))
    # Cycle through each of the rest of the parameters.
    for name, value in params[1:]:
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        # Sort by number
        continuations.sort()
        # And now join all values in num order
        joined = EMPTYSTRING.join([piece for _num, piece in continuations])
        charset, language, value = decode_rfc2231(joined)
        new_params.append(
            (name, (charset, language, '"%s"' % quote(value))))
    return new_params
Barry Warsaw12566a82002-06-29 05:58:04 +0000316 return new_params