blob: 9060fbba9198c3f80b309715c2c95b114a74ead7 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
2# Author: che@debian.org (Ben Gertzfield)
3
4"""Header encoding and decoding functionality."""
5
6import re
7import email.quopriMIME
8import email.base64MIME
9from email.Charset import Charset
10
Barry Warsaw812031b2002-05-19 23:47:53 +000011try:
Barry Warsaw1c30aa22002-06-01 05:49:17 +000012 from email._compat22 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000013except SyntaxError:
14 # Python 2.1 spells integer division differently
Barry Warsaw1c30aa22002-06-01 05:49:17 +000015 from email._compat21 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000016
# Line terminators and whitespace fragments used when building headers.
CRLF = '\r\n'
CRLFSPACE = CRLF + ' '
NL = '\n'
# Width used when expanding a hard tab to measure a line in columns.
SPACE8 = 8 * ' '
EMPTYSTRING = ''

# Default maximum header line length when the caller gives none.
MAXLINELEN = 76

# Direction flags; not referenced by the code visible in this module.
ENCODE = 1
DECODE = 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The three named groups are charset, encoding and encoded, in that order.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
38
39
40
# Helpers
# Module-level alias for quopriMIME's private _max_append.  In this file it is
# called by Header._encode_chunks() as
# _max_append(chunks, text, maxlinelen[, extra]); presumably it adds `text' to
# the `chunks' list while respecting the length limit -- confirm against
# email.quopriMIME.
_max_append = email.quopriMIME._max_append
43
44
45
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    The header is stringified and processed one physical line at a time.
    Consecutive encoded words naming the same charset are concatenated into a
    single pair.  Unencoded text at the start of a line that follows unencoded
    text from the previous line is joined to it with a single space.
    """
    header = str(header)
    # If no encoding, just return the header
    if not ecre.search(header):
        return [(header, None)]

    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue

        # ecre has three groups, so split() yields a flat list of the form
        # [text, charset, encoding, encoded, text, charset, encoding, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?  If the previous pair is
                # also unencoded, extend it with this text rather than
                # starting a new pair.
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # ecre only admits q or b; kept as a defensive fallback.
                    dec = encoded

                # Merge with the previous pair when the charset is the same.
                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
                # Drop the charset/encoding/encoded triple just consumed.
                del parts[0:3]
    return decoded
93
94
95
class Header:
    """A header value assembled from (string, Charset) chunks.

    Chunks are collected with append() and turned into a folded, encoded
    header string by encode() (also reachable through str()).
    """

    def __init__(self, s, charset=None, maxlinelen=None, header_name=None,
                 continuation_ws=' '):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  Specify its character set as a
        Charset object in the charset argument.  If None, a default Charset
        instance will be used.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab counts as eight columns when measuring the prefix.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        self.append(s, charset)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        # charset item is a Charset instance so we need to stringify it.
        uchunks = [unicode(s, str(charset)) for s, charset in self._chunks]
        return u''.join(uchunks)

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.  If
        charset is given, it should be an instance of email.Charset.Charset.
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset, firstline=0):
        """Split s into a list of (string, charset) pairs that fit a line.

        The string is returned unsplit when its encoded header length is
        within self._maxlinelen.  Otherwise us-ascii text is handed to
        _ascii_split(), and anything else is cut in two and each half is
        split recursively.

        NOTE(review): firstline is only acted on by _ascii_split(); the
        length tests made here always use self._maxlinelen.
        """
        # BAW: this appears to be a private convenience method.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)
        elen = charset.encoded_header_len(encoded)

        if elen <= self._maxlinelen:
            return [(encoded, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        # folding can take place between many of the lexical tokens (and even
        # within some of the lexical tokens), folding SHOULD be limited to
        # placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        #
        elif charset == 'us-ascii':
            return self._ascii_split(s, charset, firstline)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = self._maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], 0)
            last = charset.from_splittable(splittable[splitpnt:], 0)
        else:
            # Nothing left to halve: a single element that is still too long
            # cannot be shortened, and halving it again would recurse forever.
            if len(splittable) <= 1:
                return [(encoded, charset)]
            # Divide and conquer.
            halfway = _floordiv(len(splittable), 2)
            first = charset.from_splittable(splittable[:halfway], 0)
            last = charset.from_splittable(splittable[halfway:], 0)
        # Do the split
        return (self._split(first, charset, firstline) +
                self._split(last, charset))

    def _ascii_split(self, s, charset, firstline):
        """Fold ASCII text s, returning a list of (line, charset) pairs.

        Attempt to split the line at the highest-level syntactic break
        possible.  Note that we don't have a lot of smarts about field
        syntax; we just try to break on semi-colons, then whitespace.  A line
        offering neither break point is passed through unchanged even when it
        is too long.  The first line processed is measured against
        self._firstlinelen when firstline is true; all other lines are
        stripped of leading whitespace and measured against self._maxlinelen.
        """
        rtn = []
        lines = s.splitlines()
        while lines:
            line = lines.pop(0)
            if firstline:
                maxlinelen = self._firstlinelen
                firstline = 0
            else:
                line = line.lstrip()
                maxlinelen = self._maxlinelen
            # Short lines can remain unchanged
            if len(line.replace('\t', SPACE8)) <= maxlinelen:
                rtn.append(line)
            else:
                oldlen = len(line)
                # Try to break the line on semicolons, but if that doesn't
                # work, try to split on folding whitespace.
                while len(line) > maxlinelen:
                    i = line.rfind(';', 0, maxlinelen)
                    if i < 0:
                        break
                    rtn.append(line[:i] + ';')
                    line = line[i+1:]
                # Is the remaining stuff still longer than maxlinelen?
                if len(line) <= maxlinelen:
                    # Splitting on semis worked
                    rtn.append(line)
                    continue
                # Splitting on semis didn't finish the job.  If it did any
                # work at all, stick the remaining junk on the front of the
                # `lines' sequence and let the next pass do its thing.
                if len(line) != oldlen:
                    lines.insert(0, line)
                    continue
                # Otherwise, splitting on semis didn't help at all.  The
                # capturing group keeps the whitespace runs, so parts
                # alternates text, whitespace, text, ...
                parts = re.split(r'(\s+)', line)
                if len(parts) == 1 or (len(parts) == 3 and
                                       parts[0].endswith(':')):
                    # This line can't be split on whitespace.  There's now
                    # little we can do to get this into maxlinelen.  BAW:
                    # We're still potentially breaking the RFC by possibly
                    # allowing lines longer than the absolute maximum of 998
                    # characters.  For now, let it slide.
                    #
                    # len(parts) will be 1 if this line has no `Field: '
                    # prefix, otherwise it will be 3.
                    rtn.append(line)
                    continue
                # There is whitespace we can split on.
                first = parts.pop(0)
                sublines = [first]
                acc = len(first)
                while parts:
                    # parts now holds (whitespace, text) pairs.
                    len0 = len(parts[0])
                    len1 = len(parts[1])
                    if acc + len0 + len1 <= maxlinelen:
                        sublines.append(parts.pop(0))
                        sublines.append(parts.pop(0))
                        acc += len0 + len1
                    else:
                        # Split it here, but don't forget to ignore the
                        # next whitespace-only part.  Only a completely empty
                        # line is discarded; text gathered after an empty
                        # leading part must still be emitted.
                        joined = EMPTYSTRING.join(sublines)
                        if joined != '':
                            rtn.append(joined)
                        del parts[0]
                        first = parts.pop(0)
                        sublines = [first]
                        acc = len(first)
                rtn.append(EMPTYSTRING.join(sublines))
        return [(chunk, charset) for chunk in rtn]

    def _encode_chunks(self, newchunks=None):
        """MIME-encode a header with many different charsets and/or encodings.

        Given a list of pairs (string, charset), return a MIME-encoded string
        suitable for use in a header field.  Each pair may have different
        charsets and/or encodings, and the resulting header will accurately
        reflect each setting.

        newchunks is the list of pairs to encode; when omitted, the chunks
        stored on the instance are used.  A pair whose charset is None, or
        whose charset has no header_encoding, is passed through without
        encoding; any other pair is encoded with charset.header_encode().

        The accumulated output lines are joined with a newline followed by
        the continuation whitespace given to the constructor.
        """
        if newchunks is None:
            newchunks = self._chunks
        chunks = []
        for header, charset in newchunks:
            if charset is None or charset.header_encoding is None:
                # There's no encoding for this chunk's charsets
                _max_append(chunks, header, self._maxlinelen)
            else:
                _max_append(chunks, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, encoded
        header fields are subject to a length limit, so line-wrapping must be
        performed, even with double-byte character sets.

        Each stored chunk is split to fit the configured line lengths and the
        resulting pieces are encoded and joined into the returned string.
        The chunks stored on the instance are left untouched, so the header
        can be encoded repeatedly and appended to afterwards.
        """
        newchunks = []
        # NOTE(review): firstline=1 is passed for every chunk, not only the
        # first one -- confirm that is intended.
        for s, charset in self._chunks:
            newchunks += self._split(s, charset, 1)
        return self._encode_chunks(newchunks)
Barry Warsaw76612502002-06-28 23:46:53 +0000327 return self._encode_chunks()