blob: abc342cc7ab2293a63a79ebd9180d0a8b226a73d [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
Barry Warsaw174aa492002-09-30 15:51:31 +00002# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
Barry Warsaw409a4c02002-04-10 21:01:31 +00003
4"""Header encoding and decoding functionality."""
5
6import re
Barry Warsawe899e512003-03-06 05:39:46 +00007import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +00008from types import StringType, UnicodeType
9
Barry Warsaw409a4c02002-04-10 21:01:31 +000010import email.quopriMIME
11import email.base64MIME
Barry Warsawe899e512003-03-06 05:39:46 +000012from email.Errors import HeaderParseError
Barry Warsaw409a4c02002-04-10 21:01:31 +000013from email.Charset import Charset
14
Barry Warsaw812031b2002-05-19 23:47:53 +000015try:
Barry Warsaw1c30aa22002-06-01 05:49:17 +000016 from email._compat22 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000017except SyntaxError:
18 # Python 2.1 spells integer division differently
Barry Warsaw1c30aa22002-06-01 05:49:17 +000019 from email._compat21 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000020
Barry Warsaw174aa492002-09-30 15:51:31 +000021try:
22 True, False
23except NameError:
24 True = 1
25 False = 0
26
Barry Warsaw409a4c02002-04-10 21:01:31 +000027CRLFSPACE = '\r\n '
28CRLF = '\r\n'
Barry Warsaw76612502002-06-28 23:46:53 +000029NL = '\n'
Barry Warsawe899e512003-03-06 05:39:46 +000030SPACE = ' '
Barry Warsaw76612502002-06-28 23:46:53 +000031SPACE8 = ' ' * 8
32EMPTYSTRING = ''
Barry Warsaw409a4c02002-04-10 21:01:31 +000033
34MAXLINELEN = 76
35
36ENCODE = 1
37DECODE = 2
38
Barry Warsaw174aa492002-09-30 15:51:31 +000039USASCII = Charset('us-ascii')
40UTF8 = Charset('utf-8')
41
Barry Warsaw409a4c02002-04-10 21:01:31 +000042# Match encoded-word strings in the form =?charset?q?Hello_World?=
43ecre = re.compile(r'''
44 =\? # literal =?
45 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
46 \? # literal ?
47 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
48 \? # literal ?
49 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
50 \?= # literal ?=
51 ''', re.VERBOSE | re.IGNORECASE)
52
Barry Warsawe899e512003-03-06 05:39:46 +000053pcre = re.compile('([,;])')
54
55# Field name regexp, including trailing colon, but not separating whitespace,
56# according to RFC 2822. Character range is from tilde to exclamation mark.
57# For use with .match()
58fcre = re.compile(r'[\041-\176]+:$')
59
Barry Warsaw409a4c02002-04-10 21:01:31 +000060
61
62# Helpers
63_max_append = email.quopriMIME._max_append
64
65
66
67def decode_header(header):
68 """Decode a message header value without converting charset.
69
70 Returns a list of (decoded_string, charset) pairs containing each of the
71 decoded parts of the header. Charset is None for non-encoded parts of the
72 header, otherwise a lower-case string containing the name of the character
73 set specified in the encoded string.
Barry Warsawe899e512003-03-06 05:39:46 +000074
75 An email.Errors.HeaderParseError may be raised when certain decoding error
76 occurs (e.g. a base64 decoding exception).
Barry Warsaw409a4c02002-04-10 21:01:31 +000077 """
78 # If no encoding, just return the header
79 header = str(header)
80 if not ecre.search(header):
81 return [(header, None)]
Barry Warsaw409a4c02002-04-10 21:01:31 +000082 decoded = []
83 dec = ''
84 for line in header.splitlines():
85 # This line might not have an encoding in it
86 if not ecre.search(line):
87 decoded.append((line, None))
88 continue
Barry Warsaw409a4c02002-04-10 21:01:31 +000089 parts = ecre.split(line)
90 while parts:
91 unenc = parts.pop(0).strip()
92 if unenc:
93 # Should we continue a long line?
94 if decoded and decoded[-1][1] is None:
95 decoded[-1] = (decoded[-1][0] + dec, None)
96 else:
97 decoded.append((unenc, None))
98 if parts:
99 charset, encoding = [s.lower() for s in parts[0:2]]
100 encoded = parts[2]
Barry Warsawe899e512003-03-06 05:39:46 +0000101 dec = None
Barry Warsaw409a4c02002-04-10 21:01:31 +0000102 if encoding == 'q':
103 dec = email.quopriMIME.header_decode(encoded)
104 elif encoding == 'b':
Barry Warsawe899e512003-03-06 05:39:46 +0000105 try:
106 dec = email.base64MIME.decode(encoded)
107 except binascii.Error:
108 # Turn this into a higher level exception. BAW: Right
109 # now we throw the lower level exception away but
110 # when/if we get exception chaining, we'll preserve it.
111 raise HeaderParseError
112 if dec is None:
Barry Warsaw409a4c02002-04-10 21:01:31 +0000113 dec = encoded
114
115 if decoded and decoded[-1][1] == charset:
116 decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
117 else:
118 decoded.append((dec, charset))
119 del parts[0:3]
120 return decoded
121
122
123
Barry Warsaw8da39aa2002-07-09 16:33:47 +0000124def make_header(decoded_seq, maxlinelen=None, header_name=None,
125 continuation_ws=' '):
126 """Create a Header from a sequence of pairs as returned by decode_header()
127
128 decode_header() takes a header value string and returns a sequence of
129 pairs of the format (decoded_string, charset) where charset is the string
130 name of the character set.
131
132 This function takes one of those sequence of pairs and returns a Header
133 instance. Optional maxlinelen, header_name, and continuation_ws are as in
134 the Header constructor.
135 """
136 h = Header(maxlinelen=maxlinelen, header_name=header_name,
137 continuation_ws=continuation_ws)
138 for s, charset in decoded_seq:
Barry Warsaw15d37392002-07-23 04:29:54 +0000139 # None means us-ascii but we can simply pass it on to h.append()
140 if charset is not None and not isinstance(charset, Charset):
Barry Warsaw8da39aa2002-07-09 16:33:47 +0000141 charset = Charset(charset)
142 h.append(s, charset)
143 return h
144
145
146
Barry Warsaw409a4c02002-04-10 21:01:31 +0000147class Header:
Barry Warsawe899e512003-03-06 05:39:46 +0000148 def __init__(self, s=None, charset=None,
149 maxlinelen=None, header_name=None,
Barry Warsawf4fdff72002-12-30 19:13:00 +0000150 continuation_ws=' ', errors='strict'):
Barry Warsaw174aa492002-09-30 15:51:31 +0000151 """Create a MIME-compliant header that can contain many character sets.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000152
Barry Warsaw174aa492002-09-30 15:51:31 +0000153 Optional s is the initial header value. If None, the initial header
154 value is not set. You can later append to the header with .append()
155 method calls. s may be a byte string or a Unicode string, but see the
156 .append() documentation for semantics.
Barry Warsaw8da39aa2002-07-09 16:33:47 +0000157
Barry Warsaw174aa492002-09-30 15:51:31 +0000158 Optional charset serves two purposes: it has the same meaning as the
159 charset argument to the .append() method. It also sets the default
160 character set for all subsequent .append() calls that omit the charset
161 argument. If charset is not provided in the constructor, the us-ascii
162 charset is used both as s's initial charset and as the default for
163 subsequent .append() calls.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000164
Barry Warsaw76612502002-06-28 23:46:53 +0000165 The maximum line length can be specified explicit via maxlinelen. For
166 splitting the first line to a shorter value (to account for the field
167 header which isn't included in s, e.g. `Subject') pass in the name of
168 the field in header_name. The default maxlinelen is 76.
169
170 continuation_ws must be RFC 2822 compliant folding whitespace (usually
171 either a space or a hard tab) which will be prepended to continuation
172 lines.
Barry Warsawf4fdff72002-12-30 19:13:00 +0000173
174 errors is passed through to the .append() call.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000175 """
176 if charset is None:
Barry Warsaw174aa492002-09-30 15:51:31 +0000177 charset = USASCII
Barry Warsaw5e3bcff2002-10-14 15:13:17 +0000178 if not isinstance(charset, Charset):
179 charset = Charset(charset)
Barry Warsaw409a4c02002-04-10 21:01:31 +0000180 self._charset = charset
Barry Warsaw76612502002-06-28 23:46:53 +0000181 self._continuation_ws = continuation_ws
182 cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
Barry Warsaw409a4c02002-04-10 21:01:31 +0000183 # BAW: I believe `chunks' and `maxlinelen' should be non-public.
184 self._chunks = []
Barry Warsaw8da39aa2002-07-09 16:33:47 +0000185 if s is not None:
Barry Warsawf4fdff72002-12-30 19:13:00 +0000186 self.append(s, charset, errors)
Barry Warsaw812031b2002-05-19 23:47:53 +0000187 if maxlinelen is None:
Barry Warsaw76612502002-06-28 23:46:53 +0000188 maxlinelen = MAXLINELEN
189 if header_name is None:
190 # We don't know anything about the field header so the first line
191 # is the same length as subsequent lines.
192 self._firstlinelen = maxlinelen
Barry Warsaw812031b2002-05-19 23:47:53 +0000193 else:
Barry Warsaw76612502002-06-28 23:46:53 +0000194 # The first line should be shorter to take into account the field
195 # header. Also subtract off 2 extra for the colon and space.
196 self._firstlinelen = maxlinelen - len(header_name) - 2
197 # Second and subsequent lines should subtract off the length in
198 # columns of the continuation whitespace prefix.
199 self._maxlinelen = maxlinelen - cws_expanded_len
Barry Warsaw409a4c02002-04-10 21:01:31 +0000200
201 def __str__(self):
202 """A synonym for self.encode()."""
203 return self.encode()
204
Barry Warsaw8e69bda2002-06-29 03:26:58 +0000205 def __unicode__(self):
206 """Helper for the built-in unicode function."""
207 # charset item is a Charset instance so we need to stringify it.
208 uchunks = [unicode(s, str(charset)) for s, charset in self._chunks]
209 return u''.join(uchunks)
210
Barry Warsaw8da39aa2002-07-09 16:33:47 +0000211 # Rich comparison operators for equality only. BAW: does it make sense to
212 # have or explicitly disable <, <=, >, >= operators?
213 def __eq__(self, other):
214 # other may be a Header or a string. Both are fine so coerce
215 # ourselves to a string, swap the args and do another comparison.
216 return other == self.encode()
217
218 def __ne__(self, other):
219 return not self == other
220
Barry Warsawf4fdff72002-12-30 19:13:00 +0000221 def append(self, s, charset=None, errors='strict'):
Barry Warsaw174aa492002-09-30 15:51:31 +0000222 """Append a string to the MIME header.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000223
Barry Warsaw174aa492002-09-30 15:51:31 +0000224 Optional charset, if given, should be a Charset instance or the name
225 of a character set (which will be converted to a Charset instance). A
226 value of None (the default) means that the charset given in the
227 constructor is used.
228
229 s may be a byte string or a Unicode string. If it is a byte string
230 (i.e. isinstance(s, StringType) is true), then charset is the encoding
231 of that byte string, and a UnicodeError will be raised if the string
Barry Warsaw48330682002-09-30 23:07:35 +0000232 cannot be decoded with that charset. If s is a Unicode string, then
Barry Warsaw174aa492002-09-30 15:51:31 +0000233 charset is a hint specifying the character set of the characters in
234 the string. In this case, when producing an RFC 2822 compliant header
235 using RFC 2047 rules, the Unicode string will be encoded using the
Barry Warsaw48330682002-09-30 23:07:35 +0000236 following charsets in order: us-ascii, the charset hint, utf-8. The
237 first character set not to provoke a UnicodeError is used.
Barry Warsawf4fdff72002-12-30 19:13:00 +0000238
239 Optional `errors' is passed as the third argument to any unicode() or
240 ustr.encode() call.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000241 """
242 if charset is None:
243 charset = self._charset
Barry Warsaw92825a92002-07-23 06:08:10 +0000244 elif not isinstance(charset, Charset):
245 charset = Charset(charset)
Barry Warsaw67f8f2f2002-10-14 16:52:41 +0000246 # If the charset is our faux 8bit charset, leave the string unchanged
247 if charset <> '8bit':
248 # We need to test that the string can be converted to unicode and
249 # back to a byte string, given the input and output codecs of the
250 # charset.
251 if isinstance(s, StringType):
252 # Possibly raise UnicodeError if the byte string can't be
253 # converted to a unicode with the input codec of the charset.
254 incodec = charset.input_codec or 'us-ascii'
Barry Warsawf4fdff72002-12-30 19:13:00 +0000255 ustr = unicode(s, incodec, errors)
Barry Warsaw67f8f2f2002-10-14 16:52:41 +0000256 # Now make sure that the unicode could be converted back to a
257 # byte string with the output codec, which may be different
258 # than the iput coded. Still, use the original byte string.
259 outcodec = charset.output_codec or 'us-ascii'
Barry Warsawf4fdff72002-12-30 19:13:00 +0000260 ustr.encode(outcodec, errors)
Barry Warsaw67f8f2f2002-10-14 16:52:41 +0000261 elif isinstance(s, UnicodeType):
262 # Now we have to be sure the unicode string can be converted
263 # to a byte string with a reasonable output codec. We want to
264 # use the byte string in the chunk.
265 for charset in USASCII, charset, UTF8:
266 try:
267 outcodec = charset.output_codec or 'us-ascii'
Barry Warsawf4fdff72002-12-30 19:13:00 +0000268 s = s.encode(outcodec, errors)
Barry Warsaw67f8f2f2002-10-14 16:52:41 +0000269 break
270 except UnicodeError:
271 pass
272 else:
273 assert False, 'utf-8 conversion failed'
Barry Warsaw409a4c02002-04-10 21:01:31 +0000274 self._chunks.append((s, charset))
Tim Peters8ac14952002-05-23 15:15:30 +0000275
Barry Warsawe899e512003-03-06 05:39:46 +0000276 def _split(self, s, charset, maxlinelen, splitchars):
Barry Warsaw5e3bcff2002-10-14 15:13:17 +0000277 # Split up a header safely for use with encode_chunks.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000278 splittable = charset.to_splittable(s)
Barry Warsawe899e512003-03-06 05:39:46 +0000279 encoded = charset.from_splittable(splittable, True)
Barry Warsaw812031b2002-05-19 23:47:53 +0000280 elen = charset.encoded_header_len(encoded)
Barry Warsawe899e512003-03-06 05:39:46 +0000281 # If the line's encoded length first, just return it
282 if elen <= maxlinelen:
Barry Warsaw409a4c02002-04-10 21:01:31 +0000283 return [(encoded, charset)]
Barry Warsaw5e3bcff2002-10-14 15:13:17 +0000284 # If we have undetermined raw 8bit characters sitting in a byte
285 # string, we really don't know what the right thing to do is. We
286 # can't really split it because it might be multibyte data which we
287 # could break if we split it between pairs. The least harm seems to
288 # be to not split the header at all, but that means they could go out
289 # longer than maxlinelen.
Barry Warsawe899e512003-03-06 05:39:46 +0000290 if charset == '8bit':
Barry Warsaw5e3bcff2002-10-14 15:13:17 +0000291 return [(s, charset)]
Barry Warsaw76612502002-06-28 23:46:53 +0000292 # BAW: I'm not sure what the right test here is. What we're trying to
293 # do is be faithful to RFC 2822's recommendation that ($2.2.3):
294 #
295 # "Note: Though structured field bodies are defined in such a way that
296 # folding can take place between many of the lexical tokens (and even
297 # within some of the lexical tokens), folding SHOULD be limited to
298 # placing the CRLF at higher-level syntactic breaks."
299 #
300 # For now, I can only imagine doing this when the charset is us-ascii,
301 # although it's possible that other charsets may also benefit from the
302 # higher-level syntactic breaks.
Barry Warsaw76612502002-06-28 23:46:53 +0000303 elif charset == 'us-ascii':
Barry Warsawe899e512003-03-06 05:39:46 +0000304 return self._split_ascii(s, charset, maxlinelen, splitchars)
Barry Warsaw812031b2002-05-19 23:47:53 +0000305 # BAW: should we use encoded?
306 elif elen == len(s):
307 # We can split on _maxlinelen boundaries because we know that the
308 # encoding won't change the size of the string
Barry Warsawe899e512003-03-06 05:39:46 +0000309 splitpnt = maxlinelen
Barry Warsaw174aa492002-09-30 15:51:31 +0000310 first = charset.from_splittable(splittable[:splitpnt], False)
311 last = charset.from_splittable(splittable[splitpnt:], False)
Barry Warsaw409a4c02002-04-10 21:01:31 +0000312 else:
Barry Warsawe899e512003-03-06 05:39:46 +0000313 # Binary search for split point
314 first, last = _binsplit(splittable, charset, maxlinelen)
315 # first is of the proper length so just wrap it in the appropriate
316 # chrome. last must be recursively split.
317 fsplittable = charset.to_splittable(first)
318 fencoded = charset.from_splittable(fsplittable, True)
319 chunk = [(fencoded, charset)]
320 return chunk + self._split(last, charset, self._maxlinelen, splitchars)
Barry Warsaw76612502002-06-28 23:46:53 +0000321
Barry Warsawe899e512003-03-06 05:39:46 +0000322 def _split_ascii(self, s, charset, firstlen, splitchars):
323 line = _split_ascii(s, firstlen, self._maxlinelen,
324 self._continuation_ws, splitchars)
325 lines = line.splitlines()
326 return zip(lines, [charset]*len(lines))
Barry Warsaw76612502002-06-28 23:46:53 +0000327
Barry Warsaw0c358252002-10-13 04:06:28 +0000328 def _encode_chunks(self, newchunks):
329 # MIME-encode a header with many different charsets and/or encodings.
330 #
331 # Given a list of pairs (string, charset), return a MIME-encoded
332 # string suitable for use in a header field. Each pair may have
333 # different charsets and/or encodings, and the resulting header will
334 # accurately reflect each setting.
335 #
336 # Each encoding can be email.Utils.QP (quoted-printable, for
337 # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
338 # (Base64, for non-ASCII like character sets like KOI8-R and
339 # iso-2022-jp), or None (no encoding).
340 #
341 # Each pair will be represented on a separate line; the resulting
342 # string will be in the format:
343 #
344 # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
345 # =?charset2?b?SvxyZ2VuIEL2aW5n?="
346 #
Barry Warsaw76612502002-06-28 23:46:53 +0000347 chunks = []
Barry Warsaw0c358252002-10-13 04:06:28 +0000348 for header, charset in newchunks:
Barry Warsaw76612502002-06-28 23:46:53 +0000349 if charset is None or charset.header_encoding is None:
Barry Warsawe899e512003-03-06 05:39:46 +0000350 s = header
Barry Warsaw76612502002-06-28 23:46:53 +0000351 else:
Barry Warsawe899e512003-03-06 05:39:46 +0000352 s = charset.header_encode(header)
353 _max_append(chunks, s, self._maxlinelen, ' ')
Barry Warsaw76612502002-06-28 23:46:53 +0000354 joiner = NL + self._continuation_ws
355 return joiner.join(chunks)
Barry Warsaw409a4c02002-04-10 21:01:31 +0000356
Barry Warsawe899e512003-03-06 05:39:46 +0000357 def encode(self, splitchars=';, '):
Barry Warsaw48330682002-09-30 23:07:35 +0000358 """Encode a message header into an RFC-compliant format.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000359
360 There are many issues involved in converting a given string for use in
361 an email header. Only certain character sets are readable in most
362 email clients, and as header strings can only contain a subset of
363 7-bit ASCII, care must be taken to properly convert and encode (with
364 Base64 or quoted-printable) header strings. In addition, there is a
365 75-character length limit on any given encoded header field, so
366 line-wrapping must be performed, even with double-byte character sets.
Tim Peters8ac14952002-05-23 15:15:30 +0000367
Barry Warsaw409a4c02002-04-10 21:01:31 +0000368 This method will do its best to convert the string to the correct
369 character set used in email, and encode and line wrap it safely with
370 the appropriate scheme for that character set.
371
372 If the given charset is not known or an error occurs during
373 conversion, this function will return the header untouched.
Barry Warsawe899e512003-03-06 05:39:46 +0000374
375 Optional splitchars is a string containing characters to split long
376 ASCII lines on, in rough support of RFC 2822's `highest level
377 syntactic breaks'. This doesn't affect RFC 2047 encoded lines.
Barry Warsaw409a4c02002-04-10 21:01:31 +0000378 """
379 newchunks = []
Barry Warsawe899e512003-03-06 05:39:46 +0000380 maxlinelen = self._firstlinelen
381 lastlen = 0
Barry Warsaw409a4c02002-04-10 21:01:31 +0000382 for s, charset in self._chunks:
Barry Warsawe899e512003-03-06 05:39:46 +0000383 # The first bit of the next chunk should be just long enough to
384 # fill the next line. Don't forget the space separating the
385 # encoded words.
386 targetlen = maxlinelen - lastlen - 1
387 if targetlen < charset.encoded_header_len(''):
388 # Stick it on the next line
389 targetlen = maxlinelen
390 newchunks += self._split(s, charset, targetlen, splitchars)
391 lastchunk, lastcharset = newchunks[-1]
392 lastlen = lastcharset.encoded_header_len(lastchunk)
Barry Warsaw0c358252002-10-13 04:06:28 +0000393 return self._encode_chunks(newchunks)
Barry Warsawe899e512003-03-06 05:39:46 +0000394
395
396
397def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
398 lines = []
399 maxlen = firstlen
400 for line in s.splitlines():
401 if len(line) < maxlen:
402 lines.append(line)
403 maxlen = restlen
404 continue
405 # Attempt to split the line at the highest-level syntactic break
406 # possible. Note that we don't have a lot of smarts about field
407 # syntax; we just try to break on semi-colons, then commas, then
408 # whitespace.
409 for ch in splitchars:
410 if line.find(ch) >= 0:
411 break
412 else:
413 # There's nothing useful to split the line on, not even spaces, so
414 # just append this line unchanged
415 lines.append(line)
416 maxlen = restlen
417 continue
418 # Now split the line on the character plus trailing whitespace
419 cre = re.compile(r'%s\s*' % ch)
420 if ch in ';,':
421 eol = ch
422 else:
423 eol = ''
424 joiner = eol + ' '
425 joinlen = len(joiner)
426 wslen = len(continuation_ws.replace('\t', SPACE8))
427 this = []
428 linelen = 0
429 for part in cre.split(line):
430 curlen = linelen + max(0, len(this)-1) * joinlen
431 partlen = len(part)
432 onfirstline = not lines
433 # We don't want to split after the field name, if we're on the
434 # first line and the field name is present in the header string.
435 if ch == ' ' and onfirstline and \
436 len(this) == 1 and fcre.match(this[0]):
437 this.append(part)
438 linelen += partlen
439 elif curlen + partlen > maxlen:
440 if this:
441 lines.append(joiner.join(this) + eol)
442 this = [part]
443 linelen = wslen + partlen
444 maxlen = restlen
445 else:
446 this.append(part)
447 linelen += partlen
448 # Put any left over parts on a line by themselves
449 if this:
450 lines.append(joiner.join(this))
451 linejoiner = '\n' + continuation_ws
452 return linejoiner.join(lines)
453
454
455
456def _binsplit(splittable, charset, maxlinelen):
457 i = 0
458 j = len(splittable)
459 while i < j:
460 # Invariants:
461 # 1. splittable[:k] fits for all k <= i (note that we *assume*,
462 # at the start, that splittable[:0] fits).
463 # 2. splittable[:k] does not fit for any k > j (at the start,
464 # this means we shouldn't look at any k > len(splittable)).
465 # 3. We don't know about splittable[:k] for k in i+1..j.
466 # 4. We want to set i to the largest k that fits, with i <= k <= j.
467 #
468 m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j
469 chunk = charset.from_splittable(splittable[:m], True)
470 chunklen = charset.encoded_header_len(chunk)
471 if chunklen <= maxlinelen:
472 # m is acceptable, so is a new lower bound.
473 i = m
474 else:
475 # m is not acceptable, so final i must be < j.
476 j = m - 1
477 # i == j. Invariant #1 implies that splittable[:i] fits, and
478 # invariant #2 implies that splittable[:i+1] does not fit, so i
479 # is what we're looking for.
480 first = charset.from_splittable(splittable[:i], False)
481 last = charset.from_splittable(splittable[i:], False)
482 return first, last