blob: 9e91dc20774ebf8aa1fbee5e6535636e70e467d1 [file] [log] [blame]
# Copyright (C) 2002-2006 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org
Barry Warsaw409a4c02002-04-10 21:01:31 +00004
5"""Header encoding and decoding functionality."""
6
Barry Warsaw40ef0062006-03-18 15:41:53 +00007__all__ = [
8 'Header',
9 'decode_header',
10 'make_header',
11 ]
12
Barry Warsaw409a4c02002-04-10 21:01:31 +000013import re
Barry Warsawe899e512003-03-06 05:39:46 +000014import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +000015
Barry Warsaw40ef0062006-03-18 15:41:53 +000016import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
20from email.charset import Charset
Barry Warsaw409a4c02002-04-10 21:01:31 +000021
# String constants shared by the header folding/encoding code below.
NL = '\n'
SPACE = ' '
USPACE = u' '
# A hard tab in continuation whitespace is counted as this many columns.
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

# Default maximum line length used by Header when none is given.
MAXLINELEN = 76

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (\041) to
# tilde (\176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')



# Helpers
_max_append = email.quoprimime._max_append
Barry Warsaw409a4c02002-04-10 21:01:31 +000054
55
56
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    An email.errors.HeaderParseError may be raised when certain decoding
    errors occur (e.g. a base64 decoding exception).
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields a flat list laid out as
        # [text, charset, encoding, encoded, text, charset, ...] which always
        # ends with a (possibly empty) plain-text element.
        parts = ecre.split(line)
        for i in range(0, len(parts), 4):
            unenc = parts[i].strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + SPACE + unenc, None)
                else:
                    decoded.append((unenc, None))
            if i + 1 >= len(parts):
                # That was the trailing text; no encoded word follows it.
                break
            charset, encoding = [p.lower() for p in parts[i + 1:i + 3]]
            encoded = parts[i + 3]
            dec = None
            if encoding == 'q':
                dec = email.quoprimime.header_decode(encoded)
            elif encoding == 'b':
                # Postel's law: add missing padding
                paderr = len(encoded) % 4
                if paderr:
                    encoded += '==='[:4 - paderr]
                try:
                    dec = email.base64mime.decode(encoded)
                except binascii.Error:
                    # Turn this into a higher level exception.  BAW: Right
                    # now we throw the lower level exception away but
                    # when/if we get exception chaining, we'll preserve it.
                    raise HeaderParseError('Base64 decoding error')
            if dec is None:
                dec = encoded

            # Adjacent encoded words in the same charset are merged into a
            # single chunk.
            if decoded and decoded[-1][1] == charset:
                decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
            else:
                decoded.append((dec, charset))
    return decoded
114
115
116
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Create a Header from a sequence of pairs as returned by decode_header()

    decode_header() takes a header value string and returns a sequence of
    pairs of the format (decoded_string, charset) where charset is the string
    name of the character set.

    This function takes one of those sequences of pairs and returns a Header
    instance.  Each charset may be None, a character set name, or a Charset
    instance.  Optional maxlinelen, header_name, and continuation_ws are as
    in the Header constructor.
    """
    h = Header(maxlinelen=maxlinelen, header_name=header_name,
               continuation_ws=continuation_ws)
    for s, charset in decoded_seq:
        # None means us-ascii but we can simply pass it on to h.append()
        if charset is not None and not isinstance(charset, Charset):
            charset = Charset(charset)
        h.append(s, charset)
    return h
137
138
139
class Header:
    """A header value stored as a list of (string, charset) chunks.

    Chunks are added with append() and rendered, folded and MIME-encoded,
    with encode() (also available through str()).
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # A hard tab counts as eight columns when measuring line widths.
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        uchunks = []
        lastcs = None
        for s, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(USPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(USPACE)
            lastcs = nextcs
            uchunks.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, str):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec, errors)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec, errors)
            elif isinstance(s, unicode):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.  Note that the loop
                # deliberately rebinds `charset' so the chunk records the
                # charset that actually worked.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec, errors)
                        break
                    except UnicodeError:
                        pass
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with encode_chunks.  Returns a
        # list of (string, charset) pairs; the first pair is sized against
        # maxlinelen and the remainder against self._maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        elif charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # Binary search for split point
            first, last = _binsplit(splittable, charset, maxlinelen)
        # first is of the proper length so just wrap it in the appropriate
        # chrome.  last must be recursively split.
        fsplittable = charset.to_splittable(first)
        fencoded = charset.from_splittable(fsplittable, True)
        chunk = [(fencoded, charset)]
        return chunk + self._split(last, charset, self._maxlinelen, splitchars)

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Fold s with the module-level _split_ascii() and tag every resulting
        # line with charset.  A real list is returned because callers
        # concatenate and index the result.
        chunks = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return [(chunk, charset) for chunk in chunks]

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each charset's header encoding can be quoted-printable (for
        # ASCII-like character sets like iso-8859-1), Base64 (for non-ASCII
        # like character sets like KOI8-R and iso-2022-jp), or None (no
        # encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        chunks = []
        for header, charset in newchunks:
            if not header:
                continue
            if charset is None or charset.header_encoding is None:
                s = header
            else:
                s = charset.header_encode(header)
            # Don't add more folding whitespace than necessary
            if chunks and chunks[-1].endswith(' '):
                extra = ''
            else:
                extra = ' '
            _max_append(chunks, s, maxlinelen, extra)
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.
        """
        newchunks = []
        maxlinelen = self._firstlinelen
        lastlen = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the next line.  Don't forget the space separating the
            # encoded words.
            targetlen = maxlinelen - lastlen - 1
            if targetlen < charset.encoded_header_len(''):
                # Stick it on the next line
                targetlen = maxlinelen
            newchunks += self._split(s, charset, targetlen, splitchars)
            lastchunk, lastcharset = newchunks[-1]
            lastlen = lastcharset.encoded_header_len(lastchunk)
        return self._encode_chunks(newchunks, maxlinelen)
Barry Warsawe899e512003-03-06 05:39:46 +0000407
408
409
410def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
411 lines = []
412 maxlen = firstlen
413 for line in s.splitlines():
Barry Warsaw33975ea2003-03-07 23:24:34 +0000414 # Ignore any leading whitespace (i.e. continuation whitespace) already
415 # on the line, since we'll be adding our own.
416 line = line.lstrip()
Barry Warsawe899e512003-03-06 05:39:46 +0000417 if len(line) < maxlen:
418 lines.append(line)
419 maxlen = restlen
420 continue
421 # Attempt to split the line at the highest-level syntactic break
422 # possible. Note that we don't have a lot of smarts about field
423 # syntax; we just try to break on semi-colons, then commas, then
424 # whitespace.
425 for ch in splitchars:
Barry Warsaw6f3b0332004-05-10 14:44:04 +0000426 if ch in line:
Barry Warsawe899e512003-03-06 05:39:46 +0000427 break
428 else:
429 # There's nothing useful to split the line on, not even spaces, so
430 # just append this line unchanged
431 lines.append(line)
432 maxlen = restlen
433 continue
434 # Now split the line on the character plus trailing whitespace
435 cre = re.compile(r'%s\s*' % ch)
436 if ch in ';,':
437 eol = ch
438 else:
439 eol = ''
440 joiner = eol + ' '
441 joinlen = len(joiner)
442 wslen = len(continuation_ws.replace('\t', SPACE8))
443 this = []
444 linelen = 0
445 for part in cre.split(line):
446 curlen = linelen + max(0, len(this)-1) * joinlen
447 partlen = len(part)
448 onfirstline = not lines
449 # We don't want to split after the field name, if we're on the
450 # first line and the field name is present in the header string.
451 if ch == ' ' and onfirstline and \
452 len(this) == 1 and fcre.match(this[0]):
453 this.append(part)
454 linelen += partlen
455 elif curlen + partlen > maxlen:
456 if this:
457 lines.append(joiner.join(this) + eol)
Barry Warsawbd836df2003-03-06 20:33:04 +0000458 # If this part is longer than maxlen and we aren't already
459 # splitting on whitespace, try to recursively split this line
460 # on whitespace.
Brett Cannon1f571c62008-08-03 23:27:32 +0000461 if partlen > maxlen and ch != ' ':
Barry Warsaw5b8c69f2003-03-10 15:14:08 +0000462 subl = _split_ascii(part, maxlen, restlen,
Barry Warsaw9f3fcd92003-03-07 15:39:37 +0000463 continuation_ws, ' ')
Barry Warsaw9f3fcd92003-03-07 15:39:37 +0000464 lines.extend(subl[:-1])
465 this = [subl[-1]]
Barry Warsawbd836df2003-03-06 20:33:04 +0000466 else:
467 this = [part]
Barry Warsaw9f3fcd92003-03-07 15:39:37 +0000468 linelen = wslen + len(this[-1])
Barry Warsawe899e512003-03-06 05:39:46 +0000469 maxlen = restlen
470 else:
471 this.append(part)
472 linelen += partlen
473 # Put any left over parts on a line by themselves
474 if this:
475 lines.append(joiner.join(this))
Barry Warsaw5b8c69f2003-03-10 15:14:08 +0000476 return lines
Barry Warsawe899e512003-03-06 05:39:46 +0000477
478
479
480def _binsplit(splittable, charset, maxlinelen):
481 i = 0
482 j = len(splittable)
483 while i < j:
484 # Invariants:
485 # 1. splittable[:k] fits for all k <= i (note that we *assume*,
486 # at the start, that splittable[:0] fits).
487 # 2. splittable[:k] does not fit for any k > j (at the start,
488 # this means we shouldn't look at any k > len(splittable)).
489 # 3. We don't know about splittable[:k] for k in i+1..j.
490 # 4. We want to set i to the largest k that fits, with i <= k <= j.
491 #
492 m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j
493 chunk = charset.from_splittable(splittable[:m], True)
494 chunklen = charset.encoded_header_len(chunk)
495 if chunklen <= maxlinelen:
496 # m is acceptable, so is a new lower bound.
497 i = m
498 else:
Tim Peters2b482132003-03-06 23:41:58 +0000499 # m is not acceptable, so final i must be < m.
Barry Warsawe899e512003-03-06 05:39:46 +0000500 j = m - 1
501 # i == j. Invariant #1 implies that splittable[:i] fits, and
502 # invariant #2 implies that splittable[:i+1] does not fit, so i
503 # is what we're looking for.
504 first = charset.from_splittable(splittable[:i], False)
505 last = charset.from_splittable(splittable[i:], False)
506 return first, last