blob: ab0d3fc76ed40904fafc0a75391263c870d90cb8 [file] [log] [blame]
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00001# Copyright (C) 2002-2006 Python Software Foundation
Barry Warsawbb113862004-10-03 03:16:19 +00002# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
Barry Warsaw409a4c02002-04-10 21:01:31 +00004
5"""Header encoding and decoding functionality."""
6
# Names exported by ``from <this module> import *``.
__all__ = [
    'Header',
    'decode_header',
    'make_header',
    ]
12
Barry Warsaw409a4c02002-04-10 21:01:31 +000013import re
Barry Warsawe899e512003-03-06 05:39:46 +000014import binascii
Barry Warsaw174aa492002-09-30 15:51:31 +000015
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
20from email.charset import Charset
Barry Warsaw409a4c02002-04-10 21:01:31 +000021
# String constants shared by the decoding and folding code below.
NL = '\n'
SPACE = ' '
# Unicode counterparts, used by Header.__unicode__().
USPACE = u' '
# A hard tab in the continuation whitespace is measured as eight columns.
SPACE8 = ' ' * 8
UEMPTYSTRING = u''

# Line length used by Header when no maxlinelen is given.
MAXLINELEN = 76

# Charset instances for the character sets Header always falls back on.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')



# Helpers
# Alias for the quoted-printable module's line accumulator; used by
# Header._encode_chunks().
_max_append = email.quoprimime._max_append
Barry Warsaw409a4c02002-04-10 21:01:31 +000054
55
56
def decode_header(header):
    """Decode a message header value without converting charset.

    The header is converted with str() and scanned, line by line, for
    encoded words of the form =?charset?q?...?= or =?charset?b?...?=.

    Returns a list of (decoded_string, charset) pairs.  charset is None for
    text that was not encoded; otherwise it is the lower-cased character set
    name taken from the encoded word.  A header without any encoded word is
    returned whole as a single pair.  Consecutive encoded words that name the
    same charset are concatenated into one pair.

    An email.errors.HeaderParseError is raised when the payload of a
    base64 ("b") encoded word cannot be decoded.
    """
    header = str(header)
    # Fast path: nothing in the header looks like an encoded word.
    if ecre.search(header) is None:
        return [(header, None)]
    pairs = []
    for line in header.splitlines():
        if ecre.search(line) is None:
            # A line without encoded words is kept whole, as its own pair.
            pairs.append((line, None))
            continue
        # ecre has three groups, so split() yields repeating runs of
        # [plain text, charset, encoding, payload] followed by one final
        # plain-text item.  Walk the list one run at a time.
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                if pairs and pairs[-1][1] is None:
                    # Continue the preceding unencoded text.
                    pairs[-1] = (pairs[-1][0] + SPACE + plain, None)
                else:
                    pairs.append((plain, None))
            word = fields[start + 1:start + 4]
            if not word:
                # That was the trailing plain-text item.
                break
            charset = word[0].lower()
            encoding = word[1].lower()
            payload = word[2]
            if encoding == 'q':
                text = email.quoprimime.header_decode(payload)
            elif encoding == 'b':
                try:
                    text = email.base64mime.decode(payload)
                except binascii.Error:
                    # Report the failure as a header-level error; the
                    # low-level exception is not passed along explicitly.
                    raise HeaderParseError
            else:
                text = None
            if text is None:
                # Nothing was decoded: fall back to the raw payload.
                text = payload
            if pairs and pairs[-1][1] == charset:
                pairs[-1] = (pairs[-1][0] + text, pairs[-1][1])
            else:
                pairs.append((text, charset))
    return pairs
111
112
113
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from pairs such as those decode_header() returns.

    decoded_seq is an iterable of (decoded_string, charset) pairs, where
    charset is None, a character set name, or a Charset instance.

    maxlinelen, header_name and continuation_ws are handed unchanged to the
    Header constructor.  Each pair is added to the new header with
    Header.append(), and the resulting Header instance is returned.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        if charset is None or isinstance(charset, Charset):
            # None (meaning us-ascii) and ready-made Charset instances are
            # passed straight through to append().
            header.append(text, charset)
        else:
            # Anything else is treated as a charset name.
            header.append(text, Charset(charset))
    return header
134
135
136
class Header:
    """A header value kept as a list of (string, Charset) chunks.

    Chunks are collected with append() and turned into a folded,
    MIME-encoded string by encode().
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  When it is not None it is
        handed to .append() together with charset and errors; see the
        .append() documentation for how byte strings and Unicode strings are
        treated.  More text can be added later with further .append() calls.

        Optional charset is a Charset instance or a character set name.  It
        is used for s and is also remembered as the default for .append()
        calls that do not name a charset.  When it is None, us-ascii is
        used.

        maxlinelen is the maximum line length; MAXLINELEN is used when it is
        None.  If header_name is given, the first line is shortened by the
        length of that name plus two (for the colon and the space), since
        the field name itself is not part of s.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually either a space or a hard tab); it is prepended to
        continuation lines, and its width, with each tab counted as eight
        columns, is subtracted from the length allowed for those lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        ws_columns = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        limit = MAXLINELEN if maxlinelen is None else maxlinelen
        if header_name is None:
            # Nothing is known about the field name, so the first line may
            # be as long as the limit itself.
            self._firstlinelen = limit
        else:
            # Leave room for the field name, the colon and the space.
            self._firstlinelen = limit - len(header_name) - 2
        # Continuation lines give up the columns taken by the folding
        # whitespace that is prepended to them.
        self._maxlinelen = limit - ws_columns

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        plain = (None, 'us-ascii')
        pieces = []
        previous = None
        for s, charset in self._chunks:
            # A space has to be kept wherever the text switches between a
            # real charset and None/us-ascii, in either direction.  This
            # only applies from the second chunk onwards.
            current = charset
            if pieces:
                if previous not in plain:
                    if current in plain:
                        pieces.append(USPACE)
                        current = None
                elif current not in plain:
                    pieces.append(USPACE)
            previous = current
            pieces.append(unicode(s, str(charset)))
        return UEMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense
    # to have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Encode ourselves to a string
        # and let other perform the comparison with the operands swapped.
        return other == self.encode()

    def __ne__(self, other):
        return not (self == other)

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).
        A value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is true), charset names its encoding: the
        string is decoded with the charset's input codec and re-encoded with
        its output codec purely as a check, so a UnicodeError is raised when
        either step fails, and the original byte string is what gets stored.
        If s is a Unicode string, charset is only a hint: the string is
        encoded with the output codec of us-ascii, then of the hinted
        charset, then of utf-8, and the first one that does not raise a
        UnicodeError is used and recorded as the chunk's charset.

        When the charset compares equal to '8bit' the string is stored
        unchanged, without any of these checks.

        Optional `errors' is passed as the third argument to any unicode()
        or ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # The faux 8bit charset means: leave the string exactly as it is.
        if charset != '8bit':
            if isinstance(s, str):
                # Round-trip check only.  The input and output codecs of a
                # charset may differ, so both directions are exercised; the
                # original byte string is what ends up in the chunk.
                decoded = unicode(s, charset.input_codec or 'us-ascii',
                                  errors)
                decoded.encode(charset.output_codec or 'us-ascii', errors)
            elif isinstance(s, unicode):
                # The chunk must hold a byte string, so look for an output
                # codec able to encode the text.  The loop variable is
                # deliberately `charset': the charset that worked is the
                # one stored with the chunk.
                for charset in USASCII, charset, UTF8:
                    outcodec = charset.output_codec or 'us-ascii'
                    try:
                        s = s.encode(outcodec, errors)
                    except UnicodeError:
                        continue
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split one chunk into (string, charset) pairs short enough to be
        # encoded line by line.  maxlinelen applies to the first piece;
        # every further piece is limited by self._maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it.
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go
        # out longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying
        # to do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way
        # that folding can take place between many of the lexical tokens
        # (and even within some of the lexical tokens), folding SHOULD be
        # limited to placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is
        # us-ascii, although it's possible that other charsets may also
        # benefit from the higher-level syntactic breaks.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        if elen == len(s):
            # Encoding does not change the size of the string, so it can be
            # cut right at the length limit.
            head = charset.from_splittable(splittable[:maxlinelen], False)
            tail = charset.from_splittable(splittable[maxlinelen:], False)
        else:
            # Binary search for the split point.
            head, tail = _binsplit(splittable, charset, maxlinelen)
        # head has the proper length, so it only needs converting to its
        # output form.  tail must be split recursively.
        head_encoded = charset.from_splittable(charset.to_splittable(head),
                                               True)
        rest = self._split(tail, charset, self._maxlinelen, splitchars)
        return [(head_encoded, charset)] + rest

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Fold plain ASCII text with the module-level helper of the same
        # name and pair every resulting line with charset.
        lines = _split_ascii(s, firstlen, self._maxlinelen,
                             self._continuation_ws, splitchars)
        return zip(lines, [charset] * len(lines))

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or
        # encodings.
        #
        # newchunks is a list of (string, charset) pairs.  Empty strings are
        # skipped.  A string whose charset is None, or whose charset has no
        # header encoding, is used as it is; any other string is encoded
        # with its charset's header_encode(), giving encoded words such as
        #
        #   =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=
        #   =?charset2?b?SvxyZ2VuIEL2aW5n?=
        #
        # The pieces are accumulated into lines by _max_append() and the
        # lines are joined with a newline followed by the continuation
        # whitespace.
        lines = []
        for text, charset in newchunks:
            if not text:
                continue
            if charset is not None and charset.header_encoding is not None:
                text = charset.header_encode(text)
            # Don't add more folding whitespace than necessary.
            glue = '' if (lines and lines[-1].endswith(' ')) else ' '
            _max_append(lines, text, maxlinelen, glue)
        return (NL + self._continuation_ws).join(lines)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        Every chunk added with .append() is split into pieces that fit the
        configured line lengths, each piece is encoded according to its
        charset, and the pieces are joined into lines separated by a newline
        plus the continuation whitespace.  The resulting string is returned.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  It is only used for chunks whose charset is
        us-ascii; it doesn't affect RFC 2047 encoded lines.
        """
        pieces = []
        maxlinelen = self._firstlinelen
        used = 0
        for s, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill up the current line.  One column is reserved for the
            # space separating it from the previous piece.
            room = maxlinelen - used - 1
            if room < charset.encoded_header_len(''):
                # Not even an empty encoded word fits: start on a new line.
                room = maxlinelen
            pieces.extend(self._split(s, charset, room, splitchars))
            tail_text, tail_charset = pieces[-1]
            used = tail_charset.encoded_header_len(tail_text)
        return self._encode_chunks(pieces, maxlinelen)
Barry Warsawe899e512003-03-06 05:39:46 +0000404
405
406
def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Fold ASCII header text into a list of lines.

    s is first cut at its existing line breaks and every resulting line is
    stripped of leading whitespace and folded on its own.

    firstlen is the length limit used until the first line has been emitted;
    restlen is the limit for everything after that.  continuation_ws is only
    measured here (a tab counts as eight columns) so that its width can be
    charged to continuation lines; it is not added to the returned strings.

    splitchars lists the characters a line may be broken on, in order of
    preference; the first one that occurs in the line is used.  A ';' or ','
    stays at the end of the line it terminates.  Any other split character
    is consumed together with the whitespace that follows it, and pieces
    that share a line are rejoined with a single space.  A line containing
    none of the characters is returned unfolded, whatever its length.

    Returns the list of folded lines.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace)
        # already on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try the split characters in the order given (by
        # default semi-colons, then commas, then whitespace).
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces,
            # so just append this line unchanged.
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped because splitchars comes from the caller and
        # may contain regular expression metacharacters such as '+' or '.'.
        cre = re.compile(re.escape(ch) + r'\s*')
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            # Length of the pending line once its parts are rejoined.
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                # The part does not fit: flush what has been collected.
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this
                # line on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                # A new line starts here, so account for the continuation
                # whitespace that will precede it.
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves.
        if this:
            lines.append(joiner.join(this))
    return lines
Barry Warsawe899e512003-03-06 05:39:46 +0000474
475
476
def _binsplit(splittable, charset, maxlinelen):
    """Cut splittable at the longest prefix whose encoded form fits.

    A prefix fits when charset.encoded_header_len() of its encoded form
    (charset.from_splittable(prefix, True)) is at most maxlinelen.  The
    empty prefix is assumed to fit.

    Returns a (first, last) pair: the longest fitting prefix and the
    remainder, both produced by charset.from_splittable(..., False).
    """
    lo = 0
    hi = len(splittable)
    # Invariants of the search:
    #   * every prefix of length <= lo fits;
    #   * no prefix of length > hi fits;
    #   * lengths lo+1 .. hi are still undecided.
    while lo < hi:
        # Round up so that lo < mid <= hi and each step makes progress.
        mid = (lo + hi + 1) // 2
        candidate = charset.from_splittable(splittable[:mid], True)
        if charset.encoded_header_len(candidate) > maxlinelen:
            # mid is too long, so the answer lies strictly below it.
            hi = mid - 1
        else:
            # mid fits, so it becomes the new lower bound.
            lo = mid
    # Here lo == hi, which is the length of the longest fitting prefix.
    first = charset.from_splittable(splittable[:lo], False)
    last = charset.from_splittable(splittable[lo:], False)
    return first, last