blob: afd815fcea41f434be65884be9703a89f0c2c8a8 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
Barry Warsaw174aa492002-09-30 15:51:31 +00002# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
Barry Warsaw409a4c02002-04-10 21:01:31 +00003
4"""Header encoding and decoding functionality."""
5
6import re
Barry Warsawe899e512003-03-06 05:39:46 +00007import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +00008from types import StringType, UnicodeType
9
Barry Warsaw409a4c02002-04-10 21:01:31 +000010import email.quopriMIME
11import email.base64MIME
Barry Warsawe899e512003-03-06 05:39:46 +000012from email.Errors import HeaderParseError
Barry Warsaw409a4c02002-04-10 21:01:31 +000013from email.Charset import Charset
14
Barry Warsaw812031b2002-05-19 23:47:53 +000015try:
Barry Warsaw1c30aa22002-06-01 05:49:17 +000016 from email._compat22 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000017except SyntaxError:
18 # Python 2.1 spells integer division differently
Barry Warsaw1c30aa22002-06-01 05:49:17 +000019 from email._compat21 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000020
Barry Warsaw174aa492002-09-30 15:51:31 +000021try:
22 True, False
23except NameError:
24 True = 1
25 False = 0
26
# Line-ending and whitespace building blocks.  NL, SPACE, SPACE8, USPACE and
# UEMPTYSTRING are used further down in this module; CRLFSPACE, CRLF and
# EMPTYSTRING are not referenced in the code visible here.
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NL = '\n'
SPACE = ' '
USPACE = u' '
# A tab in the continuation whitespace is counted as this many columns.
SPACE8 = ' ' * 8
EMPTYSTRING = ''
UEMPTYSTRING = u''

# Default maximum line length used by Header when maxlinelen is None.
MAXLINELEN = 76

# NOTE(review): ENCODE, DECODE and pcre are not referenced anywhere in the
# code visible here -- presumably kept for external users; confirm before
# removing.
ENCODE = 1
DECODE = 2

# Charset instances used as the default charset (USASCII) and as the
# candidates tried when Header.append() encodes a Unicode string.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Matches (and captures) a single comma or semicolon.
pcre = re.compile('([,;])')

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176), i.e. the printable non-space ASCII characters.
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')



# Helpers
# Borrowed from email.quopriMIME; used by Header._encode_chunks() to
# accumulate output lines.
_max_append = email.quopriMIME._max_append
66
67
68
def decode_header(header):
    """Split a header value into its decoded parts, leaving charsets alone.

    The header is coerced with str() and scanned for RFC 2047 encoded-words
    (=?charset?q?...?= or =?charset?b?...?=).  The result is a list of
    (string, charset) pairs:

      - text outside encoded-words is returned with a charset of None
      - each encoded-word is returned quoted-printable or base64 decoded,
        together with its lower-cased charset name; the bytes are *not*
        converted to Unicode

    Adjacent parts sharing the same charset are merged into one pair;
    adjacent unencoded parts are merged with a single space between them.

    Raises email.Errors.HeaderParseError when a base64 encoded-word cannot
    be decoded.
    """
    header = str(header)
    # Fast path: nothing that looks like an encoded-word anywhere.
    if ecre.search(header) is None:
        return [(header, None)]
    pairs = []
    for line in header.splitlines():
        # A line without any encoded-word is passed through untouched.
        if ecre.search(line) is None:
            pairs.append((line, None))
            continue
        # ecre has three groups, so split() yields
        #   [plain, charset, encoding, encoded, plain, charset, ...]
        # i.e. a plain-text item at every fourth index, each one followed
        # by the three fields of the next encoded-word (if any).
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                if pairs and pairs[-1][1] is None:
                    # Continue the previous unencoded run.
                    pairs[-1] = (pairs[-1][0] + SPACE + plain, None)
                else:
                    pairs.append((plain, None))
            word = fields[start + 1:start + 4]
            if not word:
                # Trailing plain text; no encoded-word follows it.
                continue
            charset = word[0].lower()
            encoding = word[1].lower()
            encoded = word[2]
            if encoding == 'q':
                text = email.quopriMIME.header_decode(encoded)
            elif encoding == 'b':
                try:
                    text = email.base64MIME.decode(encoded)
                except binascii.Error:
                    # Surface a higher level error; without exception
                    # chaining the low level one is simply dropped.
                    raise HeaderParseError
            else:
                text = None
            if text is None:
                # Nothing was decoded: keep the raw encoded text.
                text = encoded
            if pairs and pairs[-1][1] == charset:
                pairs[-1] = (pairs[-1][0] + text, pairs[-1][1])
            else:
                pairs.append((text, charset))
    return pairs
123
124
125
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header instance from (string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): (decoded_string, charset), where charset is None, the
    name of a character set, or a Charset instance.  Each pair is appended
    to a fresh Header, in order.

    Optional maxlinelen, header_name, and continuation_ws are handed
    straight to the Header constructor.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # None is passed through as-is (Header.append() then falls back to
        # the header's default charset); Charset instances are used
        # directly; anything else is taken to be a charset name.
        if cs is None or isinstance(cs, Charset):
            result.append(text, cs)
        else:
            result.append(text, Charset(cs))
    return result
146
147
148
class Header:
    """A header value assembled from (string, charset) chunks.

    Chunks are collected with append() and turned into a folded, MIME
    encoded header string by encode().
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a header that may mix several character sets.

        s is the optional initial value, either a byte string or a Unicode
        string (see append() for how each is treated).  When it is None the
        header starts out empty.

        charset is used for s, and is also remembered as the default for
        every later append() call that does not name a charset.  It may be
        a Charset instance or a charset name; None selects us-ascii.

        maxlinelen is the maximum line length; None selects MAXLINELEN
        (76).  When header_name is given, the first line is shortened by
        the length of that name plus 2 (for the colon and the space), since
        the field name itself is not part of s.

        continuation_ws is the folding whitespace prepended to continuation
        lines (usually a space or a hard tab).  Its width, with each tab
        counted as 8 columns, is subtracted from the space available on
        continuation lines.

        errors is passed through to the append() call made for s.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        ws_columns = len(continuation_ws.replace('\t', SPACE8))
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        # Room on the first line: everything, unless a field name has to
        # share the line with the value.
        first = maxlinelen
        if header_name is not None:
            first = maxlinelen - len(header_name) - 2
        self._firstlinelen = first
        # Room on every following line: whatever the continuation
        # whitespace prefix leaves over.
        self._maxlinelen = maxlinelen - ws_columns

    def __str__(self):
        """Return the encoded header; same as encode()."""
        return self.encode()

    def __unicode__(self):
        """Return the header as a Unicode string (for unicode())."""
        pieces = []
        previous = None
        for text, charset in self._chunks:
            # From the second chunk on, a space is inserted where the text
            # moves from a charset to None/us-ascii, or from None/us-ascii
            # to a charset, so that word boundaries between encoded and
            # non-encoded parts survive.
            current = charset
            if pieces:
                plain = current is None or current == 'us-ascii'
                if previous is None:
                    if not plain:
                        pieces.append(USPACE)
                elif plain:
                    pieces.append(USPACE)
                    current = None
            previous = current
            pieces.append(unicode(text, str(charset)))
        return UEMPTYSTRING.join(pieces)

    # Only equality is supported; there are no ordering operators.
    def __eq__(self, other):
        """Compare the encoded form of this header against other.

        other may be a Header or a string: this header is encoded and the
        comparison is handed over to other.
        """
        encoded = self.encode()
        return other == encoded

    def __ne__(self, other):
        """Return the negation of self == other."""
        return not (self == other)

    def append(self, s, charset=None, errors='strict'):
        """Add a chunk to the header.

        charset may be a Charset instance or a charset name (which is
        turned into a Charset instance); None means the default charset
        given to the constructor.

        When charset is the faux '8bit' charset, s is stored unchanged.
        Otherwise:

          - a byte string s is taken to be encoded in charset.  It is
            decoded with the charset's input codec and the result encoded
            with its output codec purely as a check (either step may raise
            UnicodeError); the original byte string is what gets stored.
          - a Unicode string s is encoded with the first of us-ascii, the
            given charset, and utf-8 that does not raise UnicodeError; the
            resulting byte string is stored along with the charset that
            worked.

        errors is passed as the error handling argument to every
        unicode() and .encode() call made here.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if charset != '8bit':
            if isinstance(s, StringType):
                # Round-trip check only; the results are thrown away.
                as_unicode = unicode(s, charset.input_codec or 'us-ascii',
                                     errors)
                as_unicode.encode(charset.output_codec or 'us-ascii', errors)
            elif isinstance(s, UnicodeType):
                # Rebinding charset is intentional: the chunk is recorded
                # with whichever candidate managed to encode the string.
                for charset in (USASCII, charset, UTF8):
                    outcodec = charset.output_codec or 'us-ascii'
                    try:
                        s = s.encode(outcodec, errors)
                    except UnicodeError:
                        continue
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        """Cut s into a list of (string, charset) pairs that fit on lines.

        maxlinelen is the room available for the first piece; every later
        piece gets self._maxlinelen.  The result is meant to be fed to
        _encode_chunks().
        """
        splittable = charset.to_splittable(s)
        whole = charset.from_splittable(splittable, True)
        whole_len = charset.encoded_header_len(whole)
        # Nothing to do when the whole thing fits once encoded.
        if whole_len <= maxlinelen:
            return [(whole, charset)]
        # Raw 8bit data of unknown encoding is never split: cutting it
        # could break a multibyte sequence, so an over-long line is the
        # lesser evil.
        if charset == '8bit':
            return [(s, charset)]
        # Plain ASCII is folded at higher-level syntactic breaks, following
        # the recommendation in RFC 2822 section 2.2.3.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        if whole_len == len(s):
            # Encoding does not change the length, so cutting at maxlinelen
            # is safe.
            head = charset.from_splittable(splittable[:maxlinelen], False)
            tail = charset.from_splittable(splittable[maxlinelen:], False)
        else:
            # Search for the longest prefix that still fits once encoded.
            head, tail = _binsplit(splittable, charset, maxlinelen)
        # head is known to fit, so it only needs converting for output; the
        # remainder is split recursively against the continuation-line
        # length.
        head_out = charset.from_splittable(charset.to_splittable(head), True)
        rest = self._split(tail, charset, self._maxlinelen, splitchars)
        return [(head_out, charset)] + rest

    def _split_ascii(self, s, charset, firstlen, splitchars):
        """Fold ASCII text and pair each resulting line with charset."""
        folded = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return [(line, charset) for line in folded.splitlines()]

    def _encode_chunks(self, newchunks, maxlinelen):
        """MIME-encode a list of (string, charset) pairs into one string.

        A string whose charset is None, or whose charset has a
        header_encoding of None, is used as is; any other string is run
        through its charset's header_encode().  The pieces are gathered
        into lines by _max_append() and the lines joined with a newline
        plus the continuation whitespace, giving output such as:

          =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
           =?charset2?b?SvxyZ2VuIEL2aW5n?="
        """
        lines = []
        for text, charset in newchunks:
            if charset is not None and charset.header_encoding is not None:
                piece = charset.header_encode(text)
            else:
                piece = text
            # NOTE(review): _max_append comes from email.quopriMIME;
            # presumably it extends the last line while it stays within
            # maxlinelen and starts a new line otherwise -- confirm there.
            _max_append(lines, piece, maxlinelen, ' ')
        return (NL + self._continuation_ws).join(lines)

    def encode(self, splitchars=';, '):
        """Return the header as an encoded, line-wrapped string.

        Every chunk is split into pieces that fit the available line length
        and then encoded according to its charset (quoted-printable or
        Base64 as the charset dictates, or not at all); continuation lines
        start with the continuation whitespace.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        pieces = []
        linelen = self._firstlinelen
        used = 0
        for text, charset in self._chunks:
            # The first bit of a chunk should just fill up what is left of
            # the current line, allowing one column for the space between
            # encoded words.
            room = linelen - used - 1
            if room < charset.encoded_header_len(''):
                # Not even an empty encoded word fits: start on a new line.
                room = linelen
            pieces.extend(self._split(text, charset, room, splitchars))
            tail_text, tail_charset = pieces[-1]
            used = tail_charset.encoded_header_len(tail_text)
        return self._encode_chunks(pieces, linelen)
Barry Warsawe899e512003-03-06 05:39:46 +0000411
412
413
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Fold the ASCII string s at the characters in splitchars.

    s              -- the text to fold; it is processed line by line
    firstlen       -- room available on the first output line
    restlen        -- room available on every later output line
    continuation_ws -- whitespace put in front of continuation lines
    splitchars     -- candidate split characters, in order of preference
                      (the first one that occurs in a line is used)

    Returns one string in which the output lines are joined with a newline
    followed by continuation_ws.

    A line shorter than the available room, or containing none of the
    split characters, is passed through unchanged (and may therefore
    exceed the limit).  Otherwise the line is split at the chosen
    character plus any whitespace following it, and the parts are packed
    onto lines, separated by the split character and one space.  A
    non-whitespace split character is kept at the end of a line that is
    broken after it; a whitespace split character is replaced by the line
    break.
    """
    linejoiner = '\n' + continuation_ws
    # Columns taken up by the continuation prefix, a tab counting as 8.
    wslen = len(continuation_ws.replace('\t', SPACE8))
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  There are no real smarts about field syntax here; the
        # split characters are simply tried in the order given.
        for ch in splitchars:
            if line.find(ch) >= 0:
                break
        else:
            # There's nothing useful to split the line on, so just append
            # this line unchanged.
            lines.append(line)
            maxlen = restlen
            continue
        # Split on the character plus trailing whitespace.  The character
        # is escaped so that one which is special in regular expressions
        # (say `.' or `(') is matched literally.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        # A non-whitespace separator is part of the header text and has to
        # stay at the end of its line; whitespace is absorbed by the fold.
        if ch.isspace():
            eol = ''
        else:
            eol = ch
        joiner = eol + ' '
        joinlen = len(joiner)
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this
                # part on whitespace.
                if partlen > maxlen and ch != ' ':
                    subs = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    # linejoiner is literal text, not a pattern, so undo
                    # the join with a plain string split.
                    subl = subs.split(linejoiner)
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return linejoiner.join(lines)
480
481
482
def _binsplit(splittable, charset, maxlinelen):
    """Binary search for the longest prefix of splittable that fits.

    A prefix "fits" when charset.encoded_header_len() of its output form
    (charset.from_splittable(prefix, True)) is at most maxlinelen.  The
    empty prefix is assumed to fit without being checked.

    Returns a (first, last) pair: the chosen prefix and the remainder, each
    converted with charset.from_splittable(..., False).
    """
    lo = 0
    hi = len(splittable)
    # Loop invariants:
    #   - every prefix of length <= lo fits
    #   - no prefix of length > hi fits
    #   - lengths lo+1..hi are still undecided
    while lo < hi:
        # Round up so that lo < mid <= hi and the range always shrinks.
        mid = (lo + hi + 1) >> 1
        candidate = charset.from_splittable(splittable[:mid], True)
        if charset.encoded_header_len(candidate) > maxlinelen:
            # mid is too long, so the answer lies strictly below it.
            hi = mid - 1
        else:
            # mid fits, so it is the new lower bound.
            lo = mid
    # Here lo == hi, which is the largest prefix length that fits.
    first = charset.from_splittable(splittable[:lo], False)
    last = charset.from_splittable(splittable[lo:], False)
    return first, last
509 return first, last