blob: 3de44f905b58157de45bcc5cfe799f7d9b76b44f [file] [log] [blame]
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001# Copyright (C) 2002-2006 Python Software Foundation
Barry Warsawbb113862004-10-03 03:16:19 +00002# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
Barry Warsaw409a4c02002-04-10 21:01:31 +00004
5"""Header encoding and decoding functionality."""
6
# Public names exported by a star-import of this module.
__all__ = ['Header', 'decode_header', 'make_header']
12
Barry Warsaw409a4c02002-04-10 21:01:31 +000013import re
Barry Warsawe899e512003-03-06 05:39:46 +000014import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +000015
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
20from email.charset import Charset
Barry Warsaw409a4c02002-04-10 21:01:31 +000021
# Byte-string building blocks.
SPACE = ' '
NL = '\n'
SPACE8 = SPACE * 8

# Unicode counterparts, used when assembling unicode output.
USPACE = u' '
UEMPTYSTRING = u''

# Line length used when the caller does not supply maxlinelen.
MAXLINELEN = 76

# Pre-built Charset instances for the two charsets referenced by name below.
UTF8 = Charset('utf-8')
USASCII = Charset('us-ascii')
32
# Match RFC 2047 encoded-word strings of the form =?charset?q?Hello_World?=
# The pattern has three named groups (charset, encoding, encoded), so
# ecre.split() interleaves those three values between the unencoded text.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)

# Field name regexp, including the trailing colon but not the separating
# whitespace, according to RFC 2822.  The character range runs from
# exclamation mark (octal 041) to tilde (octal 176).  Because the pattern
# ends with '$', it is meant to be used with .match() on a whole token.
fcre = re.compile(r'[\041-\176]+:$')
48
Barry Warsaw409a4c02002-04-10 21:01:31 +000049
50
# Helpers
# Module-level alias for the private helper in email.quoprimime; it is called
# as _max_append(chunks, s, maxlinelen, extra) when header lines are built.
_max_append = email.quoprimime._max_append
Barry Warsaw409a4c02002-04-10 21:01:31 +000053
54
55
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    Adjacent parts that share the same charset are merged into one pair, and
    consecutive unencoded parts are joined with a single space.

    Base64 encoded-words whose trailing '=' padding was stripped are padded
    back to a multiple of four characters before being decoded.

    An email.errors.HeaderParseError is raised when a base64 encoded-word
    cannot be decoded even after the padding has been restored.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields a flat list shaped like
        # [text, charset, encoding, encoded, text, charset, encoding, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                dec = None
                if encoding == 'q':
                    dec = email.quoprimime.header_decode(encoded)
                elif encoding == 'b':
                    # Be liberal in what we accept: some producers drop the
                    # trailing '=' characters, so restore them before
                    # decoding.  `encoded` itself is left untouched.
                    padded = encoded
                    missing = len(padded) % 4
                    if missing:
                        padded += '==='[:4 - missing]
                    try:
                        dec = email.base64mime.decode(padded)
                    except binascii.Error:
                        # Turn this into a higher level exception; the lower
                        # level exception is discarded.
                        raise HeaderParseError('Base64 decoding error')
                if dec is None:
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the (charset, encoding, encoded) triple just consumed; this
            # is a no-op once only the trailing text element was left.
            del parts[0:3]
    return decoded
110
111
112
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Create a Header from a sequence of pairs as returned by decode_header()

    decoded_seq is a sequence of (decoded_string, charset) pairs in the
    format produced by decode_header().  Each charset may be None, the string
    name of a character set, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are passed to the
    Header constructor unchanged.  The populated Header instance is returned.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # None and ready-made Charset instances go to append() as they are;
        # anything else is treated as a charset name and wrapped first.
        if not (cs is None or isinstance(cs, Charset)):
            cs = Charset(cs)
        header.append(text, cs)
    return header
133
134
135
class Header:
    """A header value held as a list of (string, Charset) chunks.

    Chunks are added with append() and rendered into a single folded,
    MIME-encoded string by encode().
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen;
        it defaults to MAXLINELEN.  If header_name is given, the first line
        is shortened by the length of that name plus two (for the colon and
        the space that follow it).

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Column width of the continuation whitespace; a tab counts as eight.
        ws_columns = len(continuation_ws.replace('\t', SPACE8))
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        # Without a field name the first line is as long as any other line.
        # With one, leave room for the name, the colon and the space.
        firstlinelen = maxlinelen
        if header_name is not None:
            firstlinelen -= len(header_name) + 2
        self._firstlinelen = firstlinelen
        # Second and subsequent lines lose the columns taken up by the
        # continuation whitespace prefix.
        self._maxlinelen = maxlinelen - ws_columns

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        plain = (None, 'us-ascii')
        pieces = []
        prevcs = None
        for s, charset in self._chunks:
            curcs = charset
            # From the second chunk on, put a space at every boundary between
            # a None/us-ascii chunk and a chunk in some other charset, in
            # either direction.
            if pieces:
                prev_is_plain = prevcs in plain
                cur_is_plain = curcs in plain
                if prev_is_plain != cur_is_plain:
                    pieces.append(USPACE)
                    if cur_is_plain:
                        curcs = None
            prevcs = curcs
            pieces.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.
    def __eq__(self, other):
        # other may be a Header or a string.  Reduce this side to its encoded
        # string and let other's own == decide.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string; it is decoded with the charset's input codec and
        re-encoded with its output codec purely as a check, so a UnicodeError
        may be raised, and the original byte string is what gets stored.  If
        s is a Unicode string, it is encoded with the first of us-ascii, the
        given charset and utf-8 that does not raise a UnicodeError, and the
        resulting byte string is stored together with that charset.

        When the charset compares equal to '8bit', s is stored unchanged.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # The faux 8bit charset means "leave the string alone".
        if charset != '8bit':
            if isinstance(s, str):
                # Round-trip check only: decode with the input codec, then
                # encode with the output codec (which may differ).  The
                # result is discarded and the original byte string is kept.
                ustr = unicode(s, charset.input_codec or 'us-ascii', errors)
                ustr.encode(charset.output_codec or 'us-ascii', errors)
            elif isinstance(s, unicode):
                # Find an output codec able to represent the string; the loop
                # variable deliberately rebinds `charset` so the chunk is
                # stored under the charset that succeeded.
                for charset in (USASCII, charset, UTF8):
                    outcodec = charset.output_codec or 'us-ascii'
                    try:
                        s = s.encode(outcodec, errors)
                    except UnicodeError:
                        continue
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Break s into a list of (string, charset) pairs.  The first piece is
        # sized against maxlinelen, later pieces against self._maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # Already short enough: nothing to split.
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # Raw 8bit data of unknown structure might be multibyte, and cutting
        # it could break a character, so it is returned whole even though the
        # line may then exceed maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # us-ascii text is split at the characters named in splitchars, in
        # line with RFC 2822's advice to fold at higher-level syntactic
        # breaks.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        if elen == len(s):
            # The encoded length equals the length of s, so cut directly at
            # maxlinelen.
            head = charset.from_splittable(splittable[:maxlinelen], False)
            tail = charset.from_splittable(splittable[maxlinelen:], False)
        else:
            # Binary search for the split point.
            head, tail = _binsplit(splittable, charset, maxlinelen)
        # head is of the proper length, so it only needs converting to its
        # output form; tail is split recursively.
        head_encoded = charset.from_splittable(
            charset.to_splittable(head), True)
        rest = self._split(tail, charset, self._maxlinelen, splitchars)
        return [(head_encoded, charset)] + rest

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Delegate to the module-level _split_ascii() and tag every resulting
        # line with charset.
        lines = _split_ascii(s, firstlen, self._maxlinelen,
                             self._continuation_ws, splitchars)
        return [(line, charset) for line in lines]

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # newchunks is a list of (string, charset) pairs.  Empty strings are
        # skipped.  A string whose charset is None, or whose charset has no
        # header_encoding, is used as is; any other string is passed through
        # charset.header_encode().  The pieces are handed to _max_append()
        # together with maxlinelen, and the lines it accumulates are joined
        # with a newline followed by the continuation whitespace.
        lines = []
        for text, cs in newchunks:
            if not text:
                continue
            if cs is not None and cs.header_encoding is not None:
                text = cs.header_encode(text)
            # Don't add more folding whitespace than necessary
            separator = ' '
            if lines and lines[-1].endswith(' '):
                separator = ''
            _max_append(lines, text, maxlinelen, separator)
        return (NL + self._continuation_ws).join(lines)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        Every stored chunk is split so that it fits the available line
        length, encoded according to its charset (with Base64 or
        quoted-printable where the charset calls for it), and the pieces are
        joined into one string whose continuation lines start with the
        continuation whitespace given to the constructor.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        pieces = []
        maxlinelen = self._firstlinelen
        used = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the current line.  Don't forget the space separating the
            # encoded words.
            room = maxlinelen - used - 1
            if room < charset.encoded_header_len(''):
                # Not even an empty encoded word fits: use a full line.
                room = maxlinelen
            pieces.extend(self._split(s, charset, room, splitchars))
            tail_text, tail_charset = pieces[-1]
            used = tail_charset.encoded_header_len(tail_text)
        return self._encode_chunks(pieces, maxlinelen)
Barry Warsawe899e512003-03-06 05:39:46 +0000403
404
405
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Split the ASCII string s into a list of shorter lines.

    firstlen is the target length of the first output line and restlen the
    target length of every later one.  continuation_ws is the whitespace the
    caller will put in front of continuation lines; only its width (tabs
    counted as eight columns) is used here, the returned lines do not
    contain it.  splitchars lists the characters to try as split points, in
    order of preference; each character is matched literally.

    A line that contains none of the split characters is returned unchanged,
    so individual lines may still be longer than requested.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try the split characters in the order given (by
        # default semi-colons, then commas, then whitespace).
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped so that regex metacharacters passed in
        # splitchars are matched literally.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        # Semi-colons and commas are put back at the end of each piece; any
        # other split character is dropped.
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            # Length of the pending output line: the parts collected so far
            # plus the joiners between them.
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines
Barry Warsawe899e512003-03-06 05:39:46 +0000473
474
475
476def _binsplit(splittable, charset, maxlinelen):
477 i = 0
478 j = len(splittable)
479 while i < j:
480 # Invariants:
481 # 1. splittable[:k] fits for all k <= i (note that we *assume*,
482 # at the start, that splittable[:0] fits).
483 # 2. splittable[:k] does not fit for any k > j (at the start,
484 # this means we shouldn't look at any k > len(splittable)).
485 # 3. We don't know about splittable[:k] for k in i+1..j.
486 # 4. We want to set i to the largest k that fits, with i <= k <= j.
487 #
488 m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j
489 chunk = charset.from_splittable(splittable[:m], True)
490 chunklen = charset.encoded_header_len(chunk)
491 if chunklen <= maxlinelen:
492 # m is acceptable, so is a new lower bound.
493 i = m
494 else:
Tim Peters2b482132003-03-06 23:41:58 +0000495 # m is not acceptable, so final i must be < m.
Barry Warsawe899e512003-03-06 05:39:46 +0000496 j = m - 1
497 # i == j. Invariant #1 implies that splittable[:i] fits, and
498 # invariant #2 implies that splittable[:i+1] does not fit, so i
499 # is what we're looking for.
500 first = charset.from_splittable(splittable[:i], False)
501 last = charset.from_splittable(splittable[i:], False)
502 return first, last