blob: c72f64d7ba082585e99835091562ec795a0a8232 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
2# Author: che@debian.org (Ben Gertzfield)
3
4"""Header encoding and decoding functionality."""
5
6import re
7import email.quopriMIME
8import email.base64MIME
9from email.Charset import Charset
10
Barry Warsaw812031b2002-05-19 23:47:53 +000011try:
Barry Warsaw1c30aa22002-06-01 05:49:17 +000012 from email._compat22 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000013except SyntaxError:
14 # Python 2.1 spells integer division differently
Barry Warsaw1c30aa22002-06-01 05:49:17 +000015 from email._compat21 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000016
# Line terminators and whitespace building blocks used when folding headers.
CRLF = '\r\n'
CRLFSPACE = CRLF + ' '
NL = '\n'
SPACE8 = 8 * ' '
EMPTYSTRING = ''

# Default maximum line length; Header.__init__ falls back to this value when
# no maxlinelen is supplied.
MAXLINELEN = 76

# Direction flags (not referenced in the visible part of this module).
ENCODE, DECODE = 1, 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The pattern exposes three named groups: charset, encoding and encoded.
ecre = re.compile(
    r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''',
    re.VERBOSE | re.IGNORECASE)
38
39
40
# Helpers
# Module-level alias for quopriMIME's private _max_append helper.  In this
# file it is called as _max_append(lines, text, maxlen[, separator]) while
# assembling output lines in Header._encode_chunks.
# NOTE(review): presumably it extends the last list element when the result
# still fits in maxlen and starts a new element otherwise -- confirm against
# email.quopriMIME.
_max_append = email.quopriMIME._max_append
43
44
45
def decode_header(header):
    """Decode a message header value without converting charset.

    header is converted with str() and scanned for encoded-words of the form
    =?charset?q?...?= or =?charset?b?...?= (see the module-level `ecre'
    pattern).

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    Adjacent encoded-words that share a charset are merged into one pair.
    Unencoded text that directly follows another unencoded pair (which only
    happens across folded lines) is merged into that pair, separated by a
    single space.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]

    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue

        # ecre has three groups, so split() yields a repeating sequence of
        # [unencoded, charset, encoding, encoded, unencoded, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Append the unencoded text itself.  (Earlier code
                    # appended the previously decoded word here, which lost
                    # this text and could duplicate the previous word.)
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # ecre only admits "q" or "b", so this is purely a
                    # defensive fallback.
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    # Same charset as the previous pair: extend it.
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the charset/encoding/encoded triple just consumed.
            del parts[0:3]
    return decoded
93
94
95
class Header:
    """A header value assembled from (string, Charset) chunks.

    Chunks are collected with append() and turned into a folded, encoded
    header string by encode() (also reachable through str()).
    """

    def __init__(self, s, charset=None, maxlinelen=None, header_name=None,
                 continuation_ws=' '):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  Specify its character set as a
        Charset object in the charset argument.  If None, a default Charset
        instance will be used.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A tab in the continuation whitespace counts as eight columns.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        self.append(s, charset)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset, firstline=0):
        """Split s into pieces whose encoded form fits within _maxlinelen.

        Returns a list of (string, charset) pairs.  firstline is forwarded to
        _ascii_split() (and to the recursive call for the leading piece) so
        that the first output line can use the shorter _firstlinelen.
        """
        # Split up a header safely for use with encode_chunks.  BAW: this
        # appears to be a private convenience method.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)
        elen = charset.encoded_header_len(encoded)

        if elen <= self._maxlinelen:
            return [(encoded, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        # folding can take place between many of the lexical tokens (and even
        # within some of the lexical tokens), folding SHOULD be limited to
        # placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        #
        # NOTE(review): this relies on Charset comparing equal to its name
        # string; that comparison lives in email.Charset -- confirm there.
        elif charset == 'us-ascii':
            return self._ascii_split(s, charset, firstline)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = self._maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], 0)
            last = charset.from_splittable(splittable[splitpnt:], 0)
        else:
            # Divide and conquer.
            halfway = _floordiv(len(splittable), 2)
            first = charset.from_splittable(splittable[:halfway], 0)
            last = charset.from_splittable(splittable[halfway:], 0)
        # Do the split.  Only the leading piece can still be a first line.
        return self._split(first, charset, firstline) + \
               self._split(last, charset)

    def _ascii_split(self, s, charset, firstline):
        """Fold s at semicolons, then at whitespace.

        Returns a list of (line, charset) pairs.  If firstline is true, the
        first line is measured against _firstlinelen and keeps its leading
        whitespace; every later line is left-stripped and measured against
        _maxlinelen.  Lines that offer no break point are emitted unchanged
        even if they are too long.
        """
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then whitespace.
        rtn = []
        lines = s.splitlines()
        while lines:
            line = lines.pop(0)
            if firstline:
                maxlinelen = self._firstlinelen
                firstline = 0
            else:
                line = line.lstrip()
                maxlinelen = self._maxlinelen
            # Short lines can remain unchanged
            if len(line.replace('\t', SPACE8)) <= maxlinelen:
                rtn.append(line)
                continue
            oldlen = len(line)
            # Try to break the line on semicolons, but if that doesn't
            # work, try to split on folding whitespace.
            while len(line) > maxlinelen:
                i = line.rfind(';', 0, maxlinelen)
                if i < 0:
                    break
                rtn.append(line[:i] + ';')
                line = line[i+1:]
            # Is the remaining stuff still longer than maxlinelen?
            if len(line) <= maxlinelen:
                # Splitting on semis worked
                rtn.append(line)
                continue
            # Splitting on semis didn't finish the job.  If it did any
            # work at all, stick the remaining junk on the front of the
            # `lines' sequence and let the next pass do its thing.
            if len(line) != oldlen:
                lines.insert(0, line)
                continue
            # Otherwise, splitting on semis didn't help at all.
            parts = re.split(r'(\s+)', line)
            if len(parts) == 1 or (len(parts) == 3 and
                                   parts[0].endswith(':')):
                # This line can't be split on whitespace.  There's now
                # little we can do to get this into maxlinelen.  BAW:
                # We're still potentially breaking the RFC by possibly
                # allowing lines longer than the absolute maximum of 998
                # characters.  For now, let it slide.
                #
                # len(parts) will be 1 if this line has no `Field: '
                # prefix, otherwise it will be len(3).
                rtn.append(line)
                continue
            # There is whitespace we can split on.  After removing the first
            # token, `parts' is a sequence of (whitespace, token) pairs.
            first = parts.pop(0)
            sublines = [first]
            acc = len(first)
            while parts:
                len0 = len(parts[0])
                len1 = len(parts[1])
                if acc + len0 + len1 <= maxlinelen:
                    sublines.append(parts.pop(0))
                    sublines.append(parts.pop(0))
                    acc += len0 + len1
                else:
                    # Split it here, but don't forget to ignore the
                    # next whitespace-only part.  Test the accumulated
                    # text rather than just its first token: a line that
                    # starts with whitespace has an empty first token but
                    # may already have collected real content.
                    pending = EMPTYSTRING.join(sublines)
                    if pending:
                        rtn.append(pending)
                    del parts[0]
                    first = parts.pop(0)
                    sublines = [first]
                    acc = len(first)
            rtn.append(EMPTYSTRING.join(sublines))
        return [(chunk, charset) for chunk in rtn]

    def _encode_chunks(self, chunks=None):
        """MIME-encode a header with many different charsets and/or encodings.

        Given a list of pairs (string, charset), return a MIME-encoded string
        suitable for use in a header field.  Each pair may have different
        charsets and/or encodings, and the resulting header will accurately
        reflect each setting.

        chunks is the list of pairs to encode; when omitted (or None) the
        header's own stored chunks are used.

        Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
        character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
        non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
        (no encoding).

        Output lines are joined with a newline followed by the continuation
        whitespace; the resulting string will be in the format:

        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
          =?charset2?b?SvxyZ2VuIEL2aW5n?="
        """
        if chunks is None:
            chunks = self._chunks
        lines = []
        for header, charset in chunks:
            if charset is None or charset.header_encoding is None:
                # There's no encoding for this chunk's charsets
                _max_append(lines, header, self._maxlinelen)
            else:
                _max_append(lines, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        joiner = NL + self._continuation_ws
        return joiner.join(lines)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        The stored chunks are not modified, so encode() (and therefore str())
        can be called repeatedly and more text can be appended afterwards.

        No exception handling is performed here; errors raised by the Charset
        methods propagate to the caller.
        """
        newchunks = []
        for s, charset in self._chunks:
            newchunks += self._split(s, charset, 1)
        # Pass the split chunks along instead of storing them back on self:
        # they are already converted for output and must not be split again.
        return self._encode_chunks(newchunks)
Barry Warsaw76612502002-06-28 23:46:53 +0000320 return self._encode_chunks()