blob: 76fffb597056831689e7dff222f14dec7cd60358 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
Barry Warsaw174aa492002-09-30 15:51:31 +00002# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
Barry Warsaw409a4c02002-04-10 21:01:31 +00003
4"""Header encoding and decoding functionality."""
5
6import re
Barry Warsawe899e512003-03-06 05:39:46 +00007import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +00008from types import StringType, UnicodeType
9
Barry Warsaw409a4c02002-04-10 21:01:31 +000010import email.quopriMIME
11import email.base64MIME
Barry Warsawe899e512003-03-06 05:39:46 +000012from email.Errors import HeaderParseError
Barry Warsaw409a4c02002-04-10 21:01:31 +000013from email.Charset import Charset
14
# _floordiv is imported for its side of the 2.1/2.2 compatibility split; it is
# not referenced in the part of the module visible here.
try:
    from email._compat22 import _floordiv
except SyntaxError:
    # Python 2.1 spells integer division differently
    from email._compat21 import _floordiv

# Provide True/False on interpreters that do not define them as builtins.
try:
    True, False
except NameError:
    True = 1
    False = 0

CRLFSPACE = '\r\n '
CRLF = '\r\n'
NL = '\n'
SPACE = ' '
USPACE = u' '
# Stand-in for a hard tab when measuring the width of continuation whitespace.
SPACE8 = ' ' * 8
EMPTYSTRING = ''
UEMPTYSTRING = u''

# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 76

ENCODE = 1
DECODE = 2

# Shared Charset instances: the default header charset, and the last-resort
# charset tried when encoding Unicode strings in Header.append().
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\? # literal =?
  (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
  \? # literal ?
  (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
  \? # literal ?
  (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
  \?= # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Matches a single comma or semi-colon, capturing it.
pcre = re.compile('([,;])')

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal 041)
# to tilde (octal 176), i.e. the printable, non-space US-ASCII characters.
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')



# Helpers
# Line-length-aware list append shared with the quoted-printable module; used
# by Header._encode_chunks() to lay encoded words out on lines.
_max_append = email.quopriMIME._max_append
66
67
68
def decode_header(header):
    """Decode a message header value without converting charset.

    The header is coerced to a string and scanned line by line for RFC 2047
    encoded-words (=?charset?q-or-b?payload?=).

    Returns a list of (decoded_string, charset) pairs, one per decoded part
    of the header.  charset is None for the parts that were not encoded,
    otherwise it is the lower-cased charset name taken from the
    encoded-word.  Adjacent encoded parts that share a charset are merged
    into one pair, and unencoded text found on a line that also holds an
    encoded-word is stripped and merged into a directly preceding unencoded
    pair, separated by a single space.

    An email.Errors.HeaderParseError is raised when a base64 payload cannot
    be decoded.
    """
    text = str(header)
    # Fast path: nothing in the header looks like an encoded-word.
    if ecre.search(text) is None:
        return [(text, None)]
    pairs = []
    for line in text.splitlines():
        # Lines without any encoded-word are passed through untouched.
        if ecre.search(line) is None:
            pairs.append((line, None))
            continue
        # ecre has three groups, so split() yields a flat list of the form
        # [plain, charset, encoding, payload, plain, charset, ...].
        fields = ecre.split(line)
        total = len(fields)
        pos = 0
        while pos < total:
            plain = fields[pos].strip()
            pos += 1
            if plain:
                # Continue a preceding run of unencoded text if there is one
                if pairs and pairs[-1][1] is None:
                    pairs[-1] = (pairs[-1][0] + SPACE + plain, None)
                else:
                    pairs.append((plain, None))
            if pos >= total:
                continue
            charset, encoding = [s.lower() for s in fields[pos:pos + 2]]
            payload = fields[pos + 2]
            pos += 3
            value = None
            if encoding == 'q':
                value = email.quopriMIME.header_decode(payload)
            elif encoding == 'b':
                try:
                    value = email.base64MIME.decode(payload)
                except binascii.Error:
                    # Turn this into a higher level exception.  BAW: Right
                    # now we throw the lower level exception away but
                    # when/if we get exception chaining, we'll preserve it.
                    raise HeaderParseError
            # Fall back to the raw payload if the decoder produced nothing
            if value is None:
                value = payload
            if pairs and pairs[-1][1] == charset:
                pairs[-1] = (pairs[-1][0] + value, pairs[-1][1])
            else:
                pairs.append((value, charset))
    return pairs
123
124
125
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header().  Each charset may be None, a Charset instance, or a
    character set name; names are wrapped in a Charset before being
    appended.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  Returns the populated Header instance.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # None is passed straight through; Header.append() substitutes its
        # own default charset for it.
        if cs is None or isinstance(cs, Charset):
            result.append(text, cs)
        else:
            result.append(text, Charset(cs))
    return result
146
147
148
class Header:
    """A header value held as a list of (string, charset) chunks.

    Chunks are collected with append() and turned into a folded header
    string, with encoded-words where the charset calls for them, by
    encode().
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many charsets.

        Optional s is the initial header value; when it is None no initial
        chunk is added and the value can be built up later with .append().
        s may be a byte string or a Unicode string, see .append() for the
        semantics.

        Optional charset is used both as the charset of s and as the default
        for later .append() calls that omit a charset.  It may be a Charset
        instance or a character set name.  When it is None, us-ascii is
        used.

        Optional maxlinelen is the maximum line length; the default is
        MAXLINELEN (76).  When header_name is given, the first line is
        shortened by the length of the name plus 2 (for the colon and the
        space) because the field name is not part of s.

        continuation_ws is the folding whitespace (usually a space or a hard
        tab) prepended to continuation lines.  Its width, counting a tab as
        8 columns, is subtracted from the length allowed for the second and
        subsequent lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        ws_columns = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        limit = maxlinelen
        if limit is None:
            limit = MAXLINELEN
        # Without a field name the first line is as long as the others;
        # with one, make room for "Name: " in front of the value.
        name_columns = 0
        if header_name is not None:
            name_columns = len(header_name) + 2
        self._firstlinelen = limit - name_columns
        # Continuation lines lose the columns taken by the whitespace prefix
        self._maxlinelen = limit - ws_columns

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function.

        Each chunk is decoded with its own charset and the results are
        concatenated.  A single space is inserted wherever the chunks switch
        between None/us-ascii and any other charset, so that the boundary
        between encoded and non-encoded words is preserved.
        """
        ascii_like = (None, 'us-ascii')
        pieces = []
        prev_plain = True
        for text, cs in self._chunks:
            this_plain = cs in ascii_like
            # Only the second and subsequent chunks can need a separator
            if pieces and prev_plain != this_plain:
                pieces.append(USPACE)
            prev_plain = this_plain
            pieces.append(unicode(text, str(cs)))
        return UEMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset may be a Charset instance or the name of a
        character set (which is converted to a Charset instance).  None, the
        default, selects the charset given to the constructor.

        Unless the charset is the faux `8bit' charset, s is checked as
        follows.  If s is a byte string (isinstance(s, StringType)), it must
        decode with the charset's input codec and the result must encode
        with the charset's output codec, otherwise the UnicodeError from the
        failing conversion propagates; the original byte string is what gets
        stored.  If s is a Unicode string, it is encoded with the first of
        us-ascii, the given charset, and utf-8 whose output codec does not
        raise UnicodeError, and the chunk is stored as that byte string
        together with the charset that succeeded.

        Optional `errors' is passed as the error handling argument to every
        unicode() and .encode() call made here.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            if isinstance(s, StringType):
                # Round trip through unicode purely as a validity check: the
                # input codec must accept the bytes and the output codec,
                # which may differ from the input codec, must accept the
                # result.  The original byte string is kept.
                decoded = unicode(s, charset.input_codec or 'us-ascii',
                                  errors)
                decoded.encode(charset.output_codec or 'us-ascii', errors)
            elif isinstance(s, UnicodeType):
                # Find a reasonable output codec for the unicode string; the
                # chunk stores the resulting byte string.  The loop variable
                # deliberately rebinds charset to the candidate in use.
                for charset in (USASCII, charset, UTF8):
                    codec = charset.output_codec or 'us-ascii'
                    try:
                        encoded = s.encode(codec, errors)
                    except UnicodeError:
                        continue
                    s = encoded
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with _encode_chunks.  Returns a
        # list of (string, charset) pairs.  maxlinelen applies to the first
        # piece only; the remainder is split against self._maxlinelen.
        units = charset.to_splittable(s)
        whole = charset.from_splittable(units, True)
        wholelen = charset.encoded_header_len(whole)
        # If the line's encoded length fits, just return it
        if wholelen <= maxlinelen:
            return [(whole, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        if wholelen == len(s):
            # We can split on maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            head = charset.from_splittable(units[:maxlinelen], False)
            tail = charset.from_splittable(units[maxlinelen:], False)
        else:
            # Binary search for split point
            head, tail = _binsplit(units, charset, maxlinelen)
        # head is of the proper length so just wrap it in the appropriate
        # chrome.  tail must be recursively split.
        headword = charset.from_splittable(charset.to_splittable(head), True)
        rest = self._split(tail, charset, self._maxlinelen, splitchars)
        return [(headword, charset)] + rest

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Delegate to the module-level splitter, then tag every resulting
        # line with the charset.
        lines = _split_ascii(s, firstlen, self._maxlinelen,
                             self._continuation_ws, splitchars)
        return [(line, charset) for line in lines]

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a string suitable
        # for use in a header field.  A pair whose charset is None, or whose
        # charset has no header encoding, contributes its string as is; any
        # other pair contributes charset.header_encode(string).  Empty
        # strings are skipped.
        #
        # The pieces are laid out on lines by _max_append() and the lines
        # are joined with a newline followed by the continuation whitespace,
        # giving a result of the form:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        lines = []
        for text, cs in newchunks:
            if not text:
                continue
            if cs is not None and cs.header_encoding is not None:
                word = cs.header_encode(text)
            else:
                word = text
            # Don't add more folding whitespace than necessary
            if lines and lines[-1].endswith(' '):
                pad = ''
            else:
                pad = ' '
            _max_append(lines, word, maxlinelen, pad)
        return (NL + self._continuation_ws).join(lines)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        Every stored chunk is split into pieces that fit the line length
        limits chosen in the constructor, each piece is encoded as its
        charset requires, and the pieces are joined into lines separated by
        a newline plus the continuation whitespace.  The first piece of each
        chunk is sized to fill what is left of the line the previous chunk
        ended on, less one column for the space separating them.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Returns the encoded header value as a string.
        """
        pieces = []
        linelimit = self._firstlinelen
        used = 0
        for text, cs in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            room = linelimit - used - 1
            if room < cs.encoded_header_len(''):
                # Not even an empty encoded word fits; stick it on the next
                # line
                room = linelimit
            pieces.extend(self._split(text, cs, room, splitchars))
            tailtext, tailcs = pieces[-1]
            used = tailcs.encoded_header_len(tailtext)
        return self._encode_chunks(pieces, linelimit)
Barry Warsawe899e512003-03-06 05:39:46 +0000416
417
418
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Split an ASCII header value into a list of lines.

    s is the header text; it may already contain line breaks, in which case
    every existing line is handled separately and its leading whitespace is
    dropped.  firstlen is the length allowed for the first line produced,
    restlen the length allowed for all following lines.  continuation_ws is
    the folding whitespace the caller will prepend to continuation lines;
    only its width (a tab counts as 8 columns) is used here.

    splitchars is a string of candidate split characters in order of
    preference.  For each over-long line the first candidate that occurs in
    the line is used: the line is split on that character plus any
    whitespace following it.  A non-whitespace split character is kept at
    the end of the part it terminated; a whitespace split character is
    folded into the single space that joins the parts of a line.  Split
    characters are matched literally, so characters that are special in
    regular expressions may be used.

    Returns the list of lines, without continuation whitespace.  Lines that
    contain none of the split characters are returned unchanged even if they
    are too long.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try the split characters in the order given (by
        # default semi-colons, then commas, then whitespace).
        for ch in splitchars:
            if line.find(ch) >= 0:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped so that regular expression metacharacters
        # such as `.', `?' or `|' are matched literally.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        # A whitespace separator is replaced by the joining space; any other
        # separator is part of the header text and must be put back.
        if ch.isspace():
            eol = ''
        else:
            eol = ch
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                # The part now starts a continuation line, so account for
                # the whitespace prefix that will precede it.
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
Barry Warsawe899e512003-03-06 05:39:46 +0000486
487
488
def _binsplit(splittable, charset, maxlinelen):
    """Cut splittable at the longest prefix whose encoded form fits.

    Binary-searches for the largest k such that
    charset.encoded_header_len(charset.from_splittable(splittable[:k], True))
    is at most maxlinelen.  The empty prefix is assumed to fit.

    Returns the pair (first, last): splittable[:k] and splittable[k:], each
    passed through charset.from_splittable(..., False).
    """
    lo = 0
    hi = len(splittable)
    while lo < hi:
        # Invariants:
        # 1. splittable[:k] fits for all k <= lo (note that we *assume*,
        #    at the start, that splittable[:0] fits).
        # 2. splittable[:k] does not fit for any k > hi (at the start,
        #    this means we shouldn't look at any k > len(splittable)).
        # 3. We don't know about splittable[:k] for k in lo+1..hi.
        # 4. We want to set lo to the largest k that fits, with lo <= k <= hi.
        #
        # Round the midpoint up so that lo < mid <= hi and every iteration
        # narrows the range.
        mid = (lo + hi + 1) >> 1
        candidate = charset.from_splittable(splittable[:mid], True)
        if charset.encoded_header_len(candidate) > maxlinelen:
            # mid is not acceptable, so the answer must be below it.
            hi = mid - 1
        else:
            # mid is acceptable, so it is a new lower bound.
            lo = mid
    # lo == hi.  Invariant #1 implies that splittable[:lo] fits, and
    # invariant #2 implies that splittable[:lo+1] does not fit, so lo
    # is what we're looking for.
    head = charset.from_splittable(splittable[:lo], False)
    tail = charset.from_splittable(splittable[lo:], False)
    return head, tail