blob: 5771209c1be8e74432bd6005aa361971b784412c [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2001-2007 Python Software Foundation
2# Author: Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Miscellaneous utilities."""
6
# Public names of this module; this list controls what a star-import exports.
__all__ = [
    'collapse_rfc2231_value',
    'decode_params',
    'decode_rfc2231',
    'encode_rfc2231',
    'formataddr',
    'formatdate',
    'getaddresses',
    'make_msgid',
    'parseaddr',
    'parsedate',
    'parsedate_tz',
    'unquote',
    ]
21
22import os
23import re
24import time
25import base64
26import random
27import socket
28import urllib
29import warnings
30from io import StringIO
31
32from email._parseaddr import quote
33from email._parseaddr import AddressList as _AddressList
34from email._parseaddr import mktime_tz
35
# We need workarounds for bugs in these methods in older Pythons (see below)
37from email._parseaddr import parsedate as _parsedate
38from email._parseaddr import parsedate_tz as _parsedate_tz
39
40from quopri import decodestring as _qdecode
41
42# Intrapackage imports
43from email.encoders import _bencode, _qencode
44
# String constants shared by the helpers in this module.
COMMASPACE = ', '
EMPTYSTRING = ''
UEMPTYSTRING = ''
CRLF = '\r\n'
TICK = "'"

# A display name containing any of these characters is wrapped in double
# quotes by formataddr().
specialsre = re.compile(r'[][\\()<>@,:;".]')
# Characters that formataddr() prefixes with a backslash in a display name.
escapesre = re.compile(r'[][\\()"]')
53
54
55
56# Helpers
57
def _identity(s):
    """Return *s* unchanged (a no-op transformation)."""
    return s
60
61
def _bdecode(s):
    """Decode the base64-encoded value *s* and return the decoded bytes.

    *s* may be a bytes-like object or an ASCII-only str.  A false value
    (empty str, empty bytes, None) is returned unchanged.

    Raises UnicodeEncodeError if a str argument contains non-ASCII
    characters, and binascii.Error if the base64 padding is incorrect.
    """
    if not s:
        return s
    if isinstance(s, str):
        # The base64 alphabet is pure ASCII, so a str argument can be turned
        # into the bytes that the decoder requires without loss.
        s = s.encode('ascii')
    # Return the payload exactly as decoded.  Earlier code removed a trailing
    # newline from the *decoded* data whenever the encoded input did not end
    # in one, which truncated payloads that legitimately end in '\n'.
    return base64.decodebytes(s)
71
72
73
def formataddr(pair):
    """Build an address header value from a (realname, email_address) pair.

    This is the inverse of parseaddr(): the result is a string suitable for
    an RFC 2822 From, To or Cc header.

    When the real name is false, the email address is returned unmodified.
    Otherwise the real name has the characters matched by escapesre
    backslash-escaped, and it is wrapped in double quotes if it contains any
    character matched by specialsre.
    """
    realname, email_address = pair
    if not realname:
        return email_address
    wrapper = '"' if specialsre.search(realname) else ''
    escaped = escapesre.sub(r'\\\g<0>', realname)
    return '%s%s%s <%s>' % (wrapper, escaped, wrapper, email_address)
90
91
92
def getaddresses(fieldvalues):
    """Return a list of (REALNAME, EMAIL) pairs found in fieldvalues.

    The field values are joined with ', ' into a single string, which is
    then parsed as one address list.
    """
    return _AddressList(COMMASPACE.join(fieldvalues)).addresslist
98
99
100
# Matches an encoded word of the form =?charset?encoding?atom?= and exposes
# the three parts as the named groups 'charset', 'encoding' and 'atom'.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<atom>.*?)         # non-greedy up to the next ?= is the atom
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
110
111
112
def formatdate(timeval=None, localtime=False, usegmt=False):
    """Format a time value as an RFC 2822 date string, e.g.:

    Fri, 09 Nov 2001 01:08:47 -0000

    timeval, when given, is a floating point time value as accepted by
    gmtime() and localtime(); when it is None the current time is used.

    localtime is a flag: when true, timeval is interpreted in the local
    timezone and the numeric zone reflects the local offset (using the DST
    offset when daylight savings time applies), instead of UTC.

    usegmt selects the ascii zone name "GMT" in place of the numeric
    "-0000".  HTTP needs this form; it only has an effect when localtime is
    false.
    """
    # strftime() is deliberately avoided: it honors the locale, whereas RFC
    # 2822 requires the English day and month abbreviations.
    day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    if timeval is None:
        timeval = time.time()
    if not localtime:
        stamp = time.gmtime(timeval)
        # UTC is written as -0000 unless the textual form was requested.
        zone = 'GMT' if usegmt else '-0000'
    else:
        stamp = time.localtime(timeval)
        # Use the DST offset only if the local zone defines DST and the DST
        # flag (last field of the time tuple) is set for this moment.
        if time.daylight and stamp[-1]:
            seconds_west = time.altzone
        else:
            seconds_west = time.timezone
        whole_hours, leftover_seconds = divmod(abs(seconds_west), 3600)
        # The offset counts seconds west of UTC while the header zone is
        # expressed east of UTC, hence the inverted sign.
        sign = '-' if seconds_west > 0 else '+'
        zone = '%s%02d%02d' % (sign, whole_hours, leftover_seconds // 60)
    return '%s, %02d %s %04d %02d:%02d:%02d %s' % (
        day_names[stamp[6]],
        stamp[2],
        month_names[stamp[1] - 1],
        stamp[0], stamp[3], stamp[4], stamp[5],
        zone)
163
164
165
def make_msgid(idstring=None, domain=None):
    """Returns a string suitable for RFC 2822 compliant Message-ID, e.g:

    <20020201195627.33539.96671@nightshade.la.mastaler.com>

    Optional idstring if given is a string used to strengthen the
    uniqueness of the message id.  It is appended, after a dot, to the
    left-hand side of the id.

    Optional domain if given is used as the part after the '@'.  When it is
    None (the default) the fully qualified name of the local host, as
    reported by socket.getfqdn(), is used, which preserves the previous
    behavior.
    """
    timeval = time.time()
    utcdate = time.strftime('%Y%m%d%H%M%S', time.gmtime(timeval))
    pid = os.getpid()
    randint = random.randrange(100000)
    idstring = '' if idstring is None else '.' + idstring
    if domain is None:
        # Only query the resolver when the caller did not supply a domain;
        # the lookup can be slow or yield an unsuitable local name.
        domain = socket.getfqdn()
    return '<%s.%s.%s%s@%s>' % (utcdate, pid, randint, idstring, domain)
185
186
187
# These functions are in the standalone mimelib version only because they've
# subsequently been fixed in the latest Python versions. We use this to work
# around broken older Pythons.
def parsedate(data):
    """Parse the date string data via email._parseaddr.parsedate.

    Returns None without invoking the parser when data is false (for
    example an empty string or None); otherwise returns the parser's result.
    """
    return _parsedate(data) if data else None
195
196
def parsedate_tz(data):
    """Parse the date string data via email._parseaddr.parsedate_tz.

    Returns None without invoking the parser when data is false (for
    example an empty string or None); otherwise returns the parser's result.
    """
    return _parsedate_tz(data) if data else None
201
202
def parseaddr(addr):
    """Parse addr and return the first address found in it.

    The result is the first entry of the parsed address list, or the pair
    ('', '') when the list is empty.
    """
    parsed = _AddressList(addr).addresslist
    return parsed[0] if parsed else ('', '')
208
209
210# rfc822.unquote() doesn't properly de-backslash-ify in Python pre-2.3.
def unquote(str):
    """Strip one enclosing pair of double quotes or angle brackets from str.

    For a double-quoted value the escaped backslashes and escaped double
    quotes inside it are unescaped as well.  Strings of length 0 or 1, and
    strings without a matching enclosing pair, are returned unchanged.
    """
    if len(str) <= 1:
        return str
    if str.startswith('"') and str.endswith('"'):
        inner = str[1:-1]
        # Un-double backslashes first, then turn \" back into a bare quote.
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if str.startswith('<') and str.endswith('>'):
        return str[1:-1]
    return str
219
220
221
222# RFC2231-related functions - parameter encoding and decoding
def decode_rfc2231(s):
    """Split an RFC 2231 extended value into charset, language and text.

    s is split on at most the first two single quotes.  When that yields
    three pieces they are returned as a list [charset, language, text];
    otherwise the tuple (None, None, s) is returned.
    """
    pieces = s.split(TICK, 2)
    if len(pieces) > 2:
        return pieces
    return None, None, s
229
230
def encode_rfc2231(s, charset=None, language=None):
    """Encode string according to RFC 2231.

    s is percent-encoded (no character is treated as safe) using charset
    to turn it into octets, or 'ascii' when no charset is given.

    If neither charset nor language is given, then the percent-encoded s is
    returned as-is.  If charset is given but not language, the string is
    encoded using the empty string for language.  Otherwise the result has
    the form charset'language'encoded-text.

    Raises UnicodeEncodeError if s cannot be encoded in the selected
    charset, and LookupError if charset names an unknown codec.
    """
    # Imported locally: the module only imports the bare ``urllib`` package,
    # which does not guarantee that the ``urllib.parse`` submodule is loaded.
    import urllib.parse

    # The octets that get %-escaped must be those of the declared charset,
    # otherwise a receiver decoding with that charset gets the wrong text.
    s = urllib.parse.quote(s, safe='', encoding=charset or 'ascii')
    if charset is None and language is None:
        return s
    if language is None:
        language = ''
    return "%s'%s'%s" % (charset, language, s)
245
246
# Matches RFC 2231 parameter names of the form name*, name*N and name*N*,
# capturing the base name as 'name' and the segment number, if any, as 'num'.
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
248
def decode_params(params):
    """Decode parameters list according to RFC 2231.

    params is a sequence of 2-tuples containing (param name, string value).
    It is not modified.

    Returns a new list of 2-tuples.  The first entry of params is passed
    through unchanged.  Ordinary parameters are returned as
    (name, '"value"') with the value unquoted and then re-quoted.
    Parameters split into RFC 2231 continuations (name*, name*N, name*N*)
    are reassembled in segment order and appended after the ordinary ones:
    as (name, '"value"') when no segment was %-encoded, or as
    (name, (charset, language, '"value"')) when at least one was.

    An empty params sequence yields an empty list.
    """
    # Imported locally: the module only imports the bare ``urllib`` package,
    # which does not guarantee that the ``urllib.parse`` submodule is loaded.
    import urllib.parse

    # Copy params so we don't mess with the original
    params = list(params)
    if not params:
        return []
    name, value = params[0]
    new_params = [(name, value)]
    # Map parameter's name to a list of continuations.  The values are a
    # 3-tuple of the continuation number, the string value, and a flag
    # specifying whether a particular segment is %-encoded.
    rfc2231_params = {}
    for name, value in params[1:]:
        encoded = name.endswith('*')
        value = unquote(value)
        mo = rfc2231_continuation.match(name)
        if mo:
            name, num = mo.group('name', 'num')
            if num is not None:
                num = int(num)
            rfc2231_params.setdefault(name, []).append((num, value, encoded))
        else:
            new_params.append((name, '"%s"' % quote(value)))
    for name, continuations in rfc2231_params.items():
        segments = []
        extended = False
        # Sort by number.  A segment without a number (plain "name*") has
        # num None, which cannot be compared with an int; order such
        # segments first and otherwise keep plain tuple ordering.
        continuations.sort(
            key=lambda seg: (seg[0] is not None, seg[0] or 0) + seg[1:])
        # And now append all values in numerical order, converting
        # %-encodings for the encoded segments.  If any of the
        # continuation names ends in a *, then the entire string, after
        # decoding segments and concatenating, must have the charset and
        # language specifiers at the beginning of the string.
        for num, s, encoded in continuations:
            if encoded:
                # Decode as latin-1 so that every character stands for
                # exactly one %-encoded octet; collapse_rfc2231_value()
                # turns the characters back into those octets before
                # applying the real charset.
                s = urllib.parse.unquote(s, encoding='latin-1')
                extended = True
            segments.append(s)
        value = quote(EMPTYSTRING.join(segments))
        if extended:
            charset, language, value = decode_rfc2231(value)
            new_params.append((name, (charset, language, '"%s"' % value)))
        else:
            new_params.append((name, '"%s"' % value))
    return new_params
301
def collapse_rfc2231_value(value, errors='replace',
                           fallback_charset='us-ascii'):
    """Collapse a decoded RFC 2231 parameter value into a single string.

    value is either a plain string or a 3-tuple (charset, language, text)
    as produced by decode_params().  Anything that is not a 3-tuple is
    simply passed through unquote().

    For a 3-tuple, text is converted back to the octets it represents and
    decoded with charset, using errors as the codec error handler.  When
    charset is None (the value carried no charset information),
    fallback_charset is used instead.  If the charset is not a known codec,
    the unquoted text is returned undecoded.
    """
    if not isinstance(value, tuple) or len(value) != 3:
        return unquote(value)
    # While value comes to us as a unicode string, we need it to be a bytes
    # object.  We do not want bytes() normal utf-8 decoder, we want a straight
    # interpretation of the string as character bytes.
    charset, language, text = value
    if charset is None:
        # decode_rfc2231() reports a missing charset as None, which str()
        # would reject with a TypeError; decode with the fallback instead.
        charset = fallback_charset
    rawbytes = bytes(text, 'raw-unicode-escape')
    try:
        return str(rawbytes, charset, errors)
    except LookupError:
        # charset is not a known codec.
        return unquote(text)