# Copyright (C) 2002-2004 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield), barry@python.org (Barry Warsaw)
Barry Warsaw409a4c02002-04-10 21:01:31 +00003
4"""Header encoding and decoding functionality."""
5
6import re
Barry Warsawe899e512003-03-06 05:39:46 +00007import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +00008
Barry Warsaw409a4c02002-04-10 21:01:31 +00009import email.quopriMIME
10import email.base64MIME
Barry Warsawe899e512003-03-06 05:39:46 +000011from email.Errors import HeaderParseError
Barry Warsaw409a4c02002-04-10 21:01:31 +000012from email.Charset import Charset
13
# String constants used when joining and measuring header text.
NL = '\n'
SPACE = ' '
USPACE = u' '
# Substituted for each hard tab when the width of the continuation
# whitespace is measured, i.e. a tab counts as eight columns.
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 76

# Shared Charset instances: the default header charset, and the last-resort
# charset tried when encoding a unicode string in Header.append().
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')
24
# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The pattern has three named groups (charset, encoding, encoded), so
# ecre.split() returns the text between matches interleaved with those three
# group values for every match.
ecre = re.compile(r'''
 =\? # literal =?
 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
 \? # literal ?
 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
 \? # literal ?
 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
 \?= # literal ?=
 ''', re.VERBOSE | re.IGNORECASE)
35
# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176), i.e. the printable, non-space ASCII characters.
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')
40
Barry Warsaw409a4c02002-04-10 21:01:31 +000041
42
# Helpers
# Module-level alias for email.quopriMIME._max_append; it is called by
# Header._encode_chunks() as _max_append(chunks, s, maxlinelen, extra).
_max_append = email.quopriMIME._max_append
45
46
47
def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (decoded_string, charset) pairs, one per decoded
    part of the header.  For parts that were not RFC 2047 encoded the charset
    is None; otherwise it is the lower-cased charset name taken from the
    encoded word.

    email.Errors.HeaderParseError is raised when a base64 encoded word cannot
    be decoded.
    """
    text = str(header)
    # Fast path: no encoded-word anywhere, hand the header back as-is.
    if ecre.search(text) is None:
        return [(text, None)]
    pairs = []
    for line in text.splitlines():
        # An individual line may still be free of encoded words.
        if ecre.search(line) is None:
            pairs.append((line, None))
            continue
        # ecre has three groups, so split() produces
        #   [plain, charset, encoding, encoded, plain, charset, ...]
        # i.e. a plain-text field followed by zero or more groups of three.
        fields = ecre.split(line)
        total = len(fields)
        for start in range(0, total, 4):
            plain = fields[start].strip()
            if plain:
                # Glue consecutive unencoded text together with a space.
                if pairs and pairs[-1][1] is None:
                    pairs[-1] = (pairs[-1][0] + SPACE + plain, None)
                else:
                    pairs.append((plain, None))
            if start + 1 >= total:
                # Trailing plain text; no encoded word follows it.
                break
            charset = fields[start + 1].lower()
            encoding = fields[start + 2].lower()
            payload = fields[start + 3]
            if encoding == 'q':
                value = email.quopriMIME.header_decode(payload)
            elif encoding == 'b':
                try:
                    value = email.base64MIME.decode(payload)
                except binascii.Error:
                    # Turn this into a higher level exception.  BAW: Right
                    # now we throw the lower level exception away but
                    # when/if we get exception chaining, we'll preserve it.
                    raise HeaderParseError
            else:
                value = None
            # Fall back to the raw payload if the decoder produced nothing.
            if value is None:
                value = payload
            # Adjacent encoded words in the same charset are concatenated.
            if pairs and pairs[-1][1] == charset:
                prev_text, prev_charset = pairs[-1]
                pairs[-1] = (prev_text + value, prev_charset)
            else:
                pairs.append((value, charset))
    return pairs
102
103
104
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header(): each pair is (decoded_string, charset), where charset is
    None, a character set name, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  The populated Header instance is returned.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # None is passed straight through so that Header.append() applies
        # its own default; anything else that is not already a Charset is
        # treated as a charset name.
        if not (charset is None or isinstance(charset, Charset)):
            charset = Charset(charset)
        header.append(text, charset)
    return header
125
126
127
class Header:
    """A header value built from (string, Charset) chunks.

    Chunks are accumulated with .append() and rendered, with line splitting
    and per-charset header encoding, by .encode() (also available through
    str()).
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Width of the continuation whitespace, counting a tab as 8 columns.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the encoded form of this header with other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.  Note that the loop
                # rebinds charset to whichever candidate succeeded, and that
                # is the charset stored with the chunk below.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        """Split s into a list of (string, charset) pairs for _encode_chunks.

        maxlinelen bounds the first piece; the remainder is split recursively
        against self._maxlinelen.  splitchars is only used for us-ascii
        strings, where it is forwarded to _split_ascii().
        """
        # Split up a header safely for use with encode_chunks.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        """Split an ASCII string and pair every resulting line with charset."""
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        # Build a real list: encode() extends a list with this result and
        # then indexes its last element.
        return [(chunk, charset) for chunk in chunks]

    def _encode_chunks(self, newchunks, maxlinelen):
        """Render (string, charset) pairs as one folded header string."""
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        chunks = []
        for header, charset in newchunks:
            # Empty pieces contribute nothing to the output.
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)
Barry Warsawe899e512003-03-06 05:39:46 +0000395
396
397
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Split an ASCII header string into a list of lines.

    s is the text to split; each of its existing lines is handled in turn,
    with leading whitespace stripped.  firstlen is the length limit applied
    until the first line has been emitted, restlen the limit used afterwards.
    continuation_ws is the folding whitespace that will later prefix
    continuation lines; only its width (tabs counted as eight columns) is
    used here.  splitchars lists the characters to try splitting on, in order
    of preference; each is matched literally.

    When the split character is ';' or ',' it is kept at the end of the
    emitted pieces; any other split character is replaced by a single space
    where pieces are re-joined.  Lines containing none of the split
    characters are emitted unchanged, even if they exceed the limit.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character comes from the caller, so escape it: an unescaped regex
        # metacharacter such as '.' or '+' would otherwise split in the wrong
        # places or fail to compile.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
Barry Warsawe899e512003-03-06 05:39:46 +0000465
466
467
def _binsplit(splittable, charset, maxlinelen):
    """Binary-search the longest prefix of splittable that fits a line.

    A prefix "fits" when charset.encoded_header_len() of its
    charset.from_splittable(prefix, True) form is at most maxlinelen.  The
    empty prefix is assumed to fit without being checked.

    Returns a (first, last) pair: the chosen prefix and the remainder, both
    converted with charset.from_splittable(..., False).
    """
    lo = 0
    hi = len(splittable)
    # Loop invariants:
    #   * every prefix length <= lo fits;
    #   * no prefix length > hi fits;
    #   * lengths in lo+1..hi are still undecided.
    while lo < hi:
        # Round the midpoint up so that lo < mid <= hi and the range always
        # shrinks.
        mid = (lo + hi + 1) // 2
        candidate = charset.from_splittable(splittable[:mid], True)
        if charset.encoded_header_len(candidate) > maxlinelen:
            # mid is too long, so the answer lies strictly below it.
            hi = mid - 1
        else:
            # mid fits, so it becomes the new lower bound.
            lo = mid
    # lo == hi here: the largest prefix length known to fit.
    head = charset.from_splittable(splittable[:lo], False)
    tail = charset.from_splittable(splittable[lo:], False)
    return head, tail