blob: e691c3cb0e975ea3f27a1c70aaa2563bb8f505b7 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
2# Author: che@debian.org (Ben Gertzfield)
3
4"""Header encoding and decoding functionality."""
5
6import re
7import email.quopriMIME
8import email.base64MIME
9from email.Charset import Charset
10
Barry Warsaw812031b2002-05-19 23:47:53 +000011try:
Barry Warsaw1c30aa22002-06-01 05:49:17 +000012 from email._compat22 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000013except SyntaxError:
14 # Python 2.1 spells integer division differently
Barry Warsaw1c30aa22002-06-01 05:49:17 +000015 from email._compat21 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000016
# Line-ending and whitespace building blocks.
CRLF = '\r\n'
CRLFSPACE = CRLF + ' '
NL = '\n'
# A hard tab is counted as eight columns when measuring line lengths.
SPACE8 = ' ' * 8
EMPTYSTRING = ''

# Default maximum length of a header line when the caller gives none.
MAXLINELEN = 76

# Direction flags.
ENCODE = 1
DECODE = 2
27
# Pattern for encoded-word strings of the form =?charset?q?Hello_World?=
# The named groups are `charset', `encoding' and `encoded'; matching ignores
# case, so the encoding letter may be upper or lower case.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
38
39
40
# Helpers
# Module-level alias for the line-packing helper of email.quopriMIME, so the
# code below can call it without the package prefix.
_max_append = email.quopriMIME._max_append
43
44
45
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    Adjacent encoded words that name the same charset are merged into a
    single pair.  Unencoded text that starts a line is joined, with a single
    space, onto a directly preceding unencoded pair.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # Because ecre has three groups, split() yields a flat list shaped
        # like [text, charset, encoding, encoded, text, charset, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Append the unencoded text itself (not the previously
                    # decoded word) so that no header text is lost.
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the (charset, encoding, encoded) triple just consumed; a
            # no-op once the list is empty.
            del parts[0:3]
    return decoded
91
92
93
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from the pairs produced by decode_header().

    decoded_seq is a sequence of (decoded_string, charset) pairs.  charset
    may be None, a character set name string, or a Charset instance; name
    strings are wrapped in a Charset before being appended.

    Optional maxlinelen, header_name, and continuation_ws are handed straight
    to the Header constructor.  Returns the new Header instance.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        if charset is None or isinstance(charset, Charset):
            # None (meaning us-ascii) and ready-made Charset objects can be
            # passed on to append() unchanged.
            result.append(text, charset)
        else:
            result.append(text, Charset(charset))
    return result
114
115
116
class Header:
    """A header value built up from (string, Charset) chunks.

    Chunks are collected with append() and rendered as a folded,
    MIME-encoded string by encode(), which str() also calls.
    """

    def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
                 continuation_ws=' '):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  If None, the initial header
        value is not set.

        Specify both s's character set, and the default character set by
        setting the charset argument to a Charset object (not a character set
        name string!).  If None, a us-ascii Charset is used as both s's
        initial charset and as the default character set for subsequent
        .append() calls.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab counts as eight columns.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        # charset item is a Charset instance so we need to stringify it.
        uchunks = [unicode(s, str(charset)) for s, charset in self._chunks]
        return u''.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.  If
        charset is given, it should be an instance of Charset (not a character
        set name string!).
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset, firstline=0):
        """Split s into a list of (string, charset) pairs for _encode_chunks.

        A chunk whose encoded header length already fits in the continuation
        line length is returned as a single pair.  us-ascii chunks are handed
        to _ascii_split(); anything else is cut in two and each part is split
        again recursively.  BAW: this appears to be a private convenience
        method.
        """
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)
        elen = charset.encoded_header_len(encoded)

        if elen <= self._maxlinelen:
            return [(encoded, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        #
        elif charset == 'us-ascii':
            return self._ascii_split(s, charset, firstline)
        # Fewer than two items cannot be divided any further.  Give up and
        # return the over-long chunk rather than recursing on the very same
        # input forever.
        if len(splittable) < 2:
            return [(encoded, charset)]
        # BAW: should we use encoded?
        if elen == len(s) and 0 < self._maxlinelen < len(splittable):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = self._maxlinelen
        else:
            # Divide and conquer.  This is also the fallback when _maxlinelen
            # is not a usable split point, so both halves are always
            # non-empty and the recursion is guaranteed to terminate.
            splitpnt = _floordiv(len(splittable), 2)
        first = charset.from_splittable(splittable[:splitpnt], 0)
        last = charset.from_splittable(splittable[splitpnt:], 0)
        # Do the split
        return (self._split(first, charset, firstline) +
                self._split(last, charset))

    def _ascii_split(self, s, charset, firstline):
        """Split the us-ascii string s into a list of (line, charset) pairs.

        Attempt to split the line at the highest-level syntactic break
        possible.  Note that we don't have a lot of smarts about field
        syntax; we just try to break on semi-colons, then whitespace.

        If firstline is true the first line is measured against the first
        line length; all later lines are stripped of leading whitespace and
        measured against the continuation line length.
        """
        rtn = []
        lines = s.splitlines()
        while lines:
            line = lines.pop(0)
            if firstline:
                maxlinelen = self._firstlinelen
                firstline = 0
            else:
                line = line.lstrip()
                maxlinelen = self._maxlinelen
            # Short lines can remain unchanged
            if len(line.replace('\t', SPACE8)) <= maxlinelen:
                rtn.append(line)
            else:
                oldlen = len(line)
                # Try to break the line on semicolons, but if that doesn't
                # work, try to split on folding whitespace.
                while len(line) > maxlinelen:
                    i = line.rfind(';', 0, maxlinelen)
                    if i < 0:
                        break
                    rtn.append(line[:i] + ';')
                    line = line[i+1:]
                # Is the remaining stuff still longer than maxlinelen?
                if len(line) <= maxlinelen:
                    # Splitting on semis worked
                    rtn.append(line)
                    continue
                # Splitting on semis didn't finish the job.  If it did any
                # work at all, stick the remaining junk on the front of the
                # `lines' sequence and let the next pass do its thing.
                if len(line) != oldlen:
                    lines.insert(0, line)
                    continue
                # Otherwise, splitting on semis didn't help at all.
                parts = re.split(r'(\s+)', line)
                if len(parts) == 1 or (len(parts) == 3 and
                                       parts[0].endswith(':')):
                    # This line can't be split on whitespace.  There's now
                    # little we can do to get this into maxlinelen.  BAW:
                    # We're still potentially breaking the RFC by possibly
                    # allowing lines longer than the absolute maximum of 998
                    # characters.  For now, let it slide.
                    #
                    # len(parts) will be 1 if this line has no `Field: '
                    # prefix, otherwise it will be len(3).
                    rtn.append(line)
                    continue
                # There is whitespace we can split on.  After popping the
                # first word, `parts' holds (whitespace, word) pairs.
                first = parts.pop(0)
                sublines = [first]
                acc = len(first)
                while parts:
                    len0 = len(parts[0])
                    len1 = len(parts[1])
                    if acc + len0 + len1 <= maxlinelen:
                        sublines.append(parts.pop(0))
                        sublines.append(parts.pop(0))
                        acc += len0 + len1
                    else:
                        # Split it here, but don't forget to ignore the
                        # next whitespace-only part.  Test the joined text,
                        # not just its first word: a line that begins with
                        # whitespace has an empty first word, yet may already
                        # have accumulated words that must not be dropped.
                        subline = EMPTYSTRING.join(sublines)
                        if subline:
                            rtn.append(subline)
                        del parts[0]
                        first = parts.pop(0)
                        sublines = [first]
                        acc = len(first)
                rtn.append(EMPTYSTRING.join(sublines))
        return [(chunk, charset) for chunk in rtn]

    def _encode_chunks(self, newchunks=None):
        """MIME-encode a header with many different charsets and/or encodings.

        Given a list of pairs (string, charset), return a MIME-encoded string
        suitable for use in a header field.  Each pair may have different
        charsets and/or encodings, and the resulting header will accurately
        reflect each setting.

        newchunks is the list of pairs to encode; if it is None the header's
        own chunks are used.  A pair whose charset is None, or whose charset
        has no header encoding, is added as-is; any other pair is first passed
        through its charset's header_encode().

        The pieces are packed into lines by _max_append() and the lines are
        joined with a newline followed by the continuation whitespace, giving
        for example:

        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=
         =?charset2?b?SvxyZ2VuIEL2aW5n?="
        """
        if newchunks is None:
            newchunks = self._chunks
        chunks = []
        for header, charset in newchunks:
            if charset is None or charset.header_encoding is None:
                # There's no encoding for this chunk's charsets
                _max_append(chunks, header, self._maxlinelen)
            else:
                _max_append(chunks, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Every chunk is split with _split() and the resulting pieces are
        encoded and joined by _encode_chunks().  The header's stored chunks
        are left untouched, so encode(), str() and the comparison operators
        can be used repeatedly and give the same result each time.

        Returns the encoded header value as a string.
        """
        newchunks = []
        for s, charset in self._chunks:
            newchunks += self._split(s, charset, 1)
        # Hand the split chunks over explicitly instead of storing them on
        # the instance: they are already converted to the output charset and
        # must not be split or converted a second time.
        return self._encode_chunks(newchunks)
Barry Warsaw76612502002-06-28 23:46:53 +0000365 return self._encode_chunks()