blob: 2562b30c1672f97e41e70f17a16b2347489a6364 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
# Public names exported by a star-import of this module.
__all__ = [
    'Header',
    'decode_header',
    'make_header',
    ]
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
21Charset = _charset.Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022
# Small string/bytes constants shared by the code below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Line length Header falls back to when no maxlinelen is supplied.
MAXLINELEN = 78

# Pre-built Charset instances.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The three named groups (charset, encoding, encoded) are what
# decode_header() receives back from ecre.split().
ecre = re.compile(r'''
 =\? # literal =?
 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
 \? # literal ?
 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
 \? # literal ?
 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
 \?= # literal ?=
 (?=[ \t]|$) # whitespace or the end of the string
 ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack: a newline immediately followed by a run of
# non-blank characters and a colon.
_embeded_header = re.compile(r'\n[^ \t]+:')



# Helpers
# Module-level alias for the quoprimime helper of the same name.
_max_append = email.quoprimime._max_append
58
59
60
def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (string, charset) pairs, one per decoded part
    of the header.  For parts that were not RFC 2047 encoded the charset is
    None; for encoded parts it is the lower-cased charset name taken from
    the encoded word.

    header may be a string (with or without RFC 2047 encoded words) or a
    Header object.  A Header object simply yields a copy of its chunk list,
    and a string containing no encoded word is returned unchanged as
    [(header, None)].  Otherwise every part is returned as bytes, and
    adjacent parts sharing a charset are merged (unencoded parts are joined
    with a single space, encoded parts are concatenated directly).

    An email.errors.HeaderParseError is raised when a base64 encoded word
    cannot be decoded.
    """
    # A Header object already holds its (string, charset) chunks.
    if hasattr(header, '_chunks'):
        return list(header._chunks)
    # Nothing to decode: hand the value back with no charset.
    if ecre.search(header) is None:
        return [(header, None)]
    # Pass 1: break every line into (text, encoding, charset) triplets.
    # ecre.split() returns plain text at indexes 0, 4, 8, ... and the
    # charset/encoding/encoded groups of each match in between.
    words = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                words.append((plain, None, None))
            if start + 1 < len(fields):
                charset, encoding, encoded = fields[start + 1:start + 4]
                words.append((encoded, encoding.lower(), charset.lower()))
    # Pass 2: undo the quoted-printable or base64 transformation.
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # Unencoded text passes through untouched.
            word = encoded_string
        elif encoding == 'q':
            word = email.quoprimime.header_decode(encoded_string)
        elif encoding == 'b':
            # Postel's law: add missing padding
            remainder = len(encoded_string) % 4
            if remainder:
                encoded_string += '==='[:4 - remainder]
            try:
                word = email.base64mime.decode(encoded_string)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
        decoded_words.append((word, charset))
    # Pass 3: convert everything to bytes and merge consecutive runs that
    # share a charset.
    collapsed = []
    run_bytes = run_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if run_bytes is not None and charset == run_charset:
            # Same charset as the open run: extend it.  Only unencoded
            # text gets a separating space.
            if run_charset is None:
                run_bytes += BSPACE + word
            else:
                run_bytes += word
            continue
        if run_bytes is not None:
            collapsed.append((run_bytes, run_charset))
        run_bytes, run_charset = word, charset
    collapsed.append((run_bytes, run_charset))
    return collapsed
139
140
141
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from pairs such as those returned by decode_header().

    decoded_seq is an iterable of (decoded_string, charset) pairs, where
    charset is None, the name of a character set, or a Charset instance.
    Each pair is appended, in order, to a fresh Header.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # None is passed straight through; Header.append() substitutes the
        # header's default charset.  Charset names are wrapped first.
        if cs is None or isinstance(cs, Charset):
            header.append(text, cs)
        else:
            header.append(text, Charset(cs))
    return header
162
163
164
class Header:
    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value; when it is None the header
        starts out empty and can be filled later with .append().  s may be a
        byte string or a Unicode string -- see .append() for the semantics.

        Optional charset plays two roles: it is the charset used for s, and
        it becomes the default for every later .append() call that does not
        name one.  When omitted, us-ascii is used for both.

        maxlinelen is the maximum line length; it defaults to 78 as
        recommended by RFC 2822.  Pass the field name (e.g. `Subject') as
        header_name so the first line can be shortened to leave room for it,
        since the field name is not part of s.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually a space or a hard tab); it is prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._maxlinelen = MAXLINELEN if maxlinelen is None else maxlinelen
        # The "+ 2" accounts for the colon and space after the field name.
        self._headerlen = 0 if header_name is None else len(header_name) + 2
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        pieces = []
        prev_cs = None
        for text, cs in self._chunks:
            if cs == _charset.UNKNOWN8BIT:
                # Bytes smuggled in as surrogate escapes are shown as
                # replacement characters.
                raw = text.encode('ascii', 'surrogateescape')
                text = raw.decode('ascii', 'replace')
            # Spaces at encoded/non-encoded word boundaries must be
            # preserved, so a space is inserted whenever we cross between
            # None/us-ascii and any other charset.  This only applies from
            # the second chunk on.
            if pieces:
                was_plain = prev_cs in (None, 'us-ascii')
                is_plain = cs in (None, 'us-ascii')
                if was_plain != is_plain:
                    pieces.append(SPACE)
            prev_cs = cs
            pieces.append(text)
        return EMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Reduce ourselves to the
        # unencoded string value and let other do the comparison with the
        # operands swapped.
        mine = str(self)
        return other == mine

    def __ne__(self, other):
        return not (self == other)

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset may be a Charset instance or the name of a
        character set (which is converted to a Charset instance).  None, the
        default, selects the charset given to the constructor.

        s may be a byte string or a Unicode string.  A byte string (anything
        for which isinstance(s, str) is false) is decoded using the charset's
        input codec, and a UnicodeError is raised if that fails.  For a
        Unicode string the charset is a hint naming the character set of its
        characters.  Either way the resulting text must be encodable with
        the charset's output codec, otherwise a UnicodeError is raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if isinstance(s, str):
            text = s
        else:
            text = s.decode(charset.input_codec or 'us-ascii', errors)
        # Fail early if the text cannot be represented in the output
        # character set; the encoded result itself is not kept.
        out_codec = charset.output_codec or 'us-ascii'
        if out_codec != _charset.UNKNOWN8BIT:
            text.encode(out_codec, errors)
        self._chunks.append((text, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        Header strings may only contain a subset of 7-bit ASCII, so each
        chunk is encoded according to its charset and the result is folded
        into lines.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor for this call; a value of 0 means don't wrap.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce
        RFC-compliant line separators when needed.

        Raises email.errors.HeaderParseError if the encoded value appears to
        contain an embedded header.
        """
        self._normalize()
        limit = self._maxlinelen if maxlinelen is None else maxlinelen
        # A limit of 0 means don't wrap.  For all practical purposes a huge
        # number accomplishes that and keeps _ValueFormatter simple.
        if limit == 0:
            limit = 1000000
        formatter = _ValueFormatter(self._headerlen, limit,
                                    self._continuation_ws, splitchars)
        for text, cs in self._chunks:
            segments = text.splitlines()
            first = segments[0] if segments else ''
            formatter.feed(first, cs)
            for segment in segments[1:]:
                formatter.newline()
                if cs.header_encoding is not None:
                    formatter.feed(self._continuation_ws, USASCII)
                    segment = ' ' + segment.lstrip()
                formatter.feed(segment, cs)
            if len(segments) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Collapse every run of chunks that share a charset into a single
        # chunk whose text is the run joined with spaces.
        merged = []
        run = []
        run_charset = None
        for text, cs in self._chunks:
            if cs == run_charset:
                run.append(text)
                continue
            if run_charset is not None:
                merged.append((SPACE.join(run), run_charset))
            run = [text]
            run_charset = cs
        if run:
            merged.append((SPACE.join(run), run_charset))
        self._chunks = merged
356
357
358
class _ValueFormatter:
    """Collect encoded header text and fold it into a list of lines.

    Finished lines are stored in self._lines; the line being built lives in
    self._current_line (an _Accumulator).  Header.encode() drives an
    instance through feed(), newline() and add_transition() and then reads
    the result with _str().
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen seeds the size of the first line, maxlen is the target
        line length, continuation_ws is the whitespace used to start
        continuation lines, and splitchars are the characters tried, in
        order, as split points for unencoded text.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # A tab in the continuation whitespace is counted as eight columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Move the current line, if it has any length, to the finished lines."""
        # Popping and conditionally re-pushing drops a trailing None
        # (the marker pushed by add_transition) but keeps real text.
        end_of_line = self._current_line.pop()
        if end_of_line is not None:
            self._current_line.push(end_of_line)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Push a None marker onto the current line.

        _Accumulator counts the marker as one character and renders it as a
        single space unless it is the last item on the line.
        """
        self._current_line.push(None)

    def feed(self, string, charset):
        """Add string, encoded per charset, folding lines as needed."""
        # If the string itself fits on the current line in its encoded format,
        # then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            # Pick the first split character that occurs in the string.
            for ch in self._splitchars:
                if ch in string:
                    break
            else:
                ch = None
            # If there's no available split character then regardless of
            # whether the string fits on the line, we have to put it on a line
            # by itself.
            if ch is None:
                if not self._current_line.is_onlyws():
                    self._lines.append(str(self._current_line))
                    self._current_line.reset(self._continuation_ws)
                self._current_line.push(encoded_string)
            else:
                self._ascii_split(string, ch)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last encoded line stays open so later text can extend it.
        self._current_line.push(last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the space left on the current line, then on each new line.

        This generator never terminates; every value after the first is
        maxlen minus the width of the continuation whitespace.
        """
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        """Fold unencoded string across lines, splitting on character ch."""
        # Text gathered in `holding` has been accepted for the current line
        # but not yet pushed onto it.
        holding = _Accumulator()
        # Split the line on the split character, preserving it.  If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on.  However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character.  But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk.  Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done.  Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part.  In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break.  In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character.  See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit.  Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present.  First let's see if all three
                # parts will fit on the current line.  If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit.  See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace.  Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace.  The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))
562
563
564
565def _spliterator(character, string):
566 parts = list(reversed(re.split('(%s)' % character, string)))
567 while parts:
568 part = parts.pop()
569 splitparts = (parts.pop() if parts else None)
570 nextpart = (parts.pop() if parts else None)
571 yield (part, splitparts, nextpart)
572 if nextpart is not None:
573 parts.append(nextpart)
574
575
576class _Accumulator:
Guido van Rossum9604e662007-08-30 03:46:43 +0000577 def __init__(self, initial_size=0):
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000578 self._initial_size = initial_size
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000579 self._current = []
580
581 def push(self, string):
582 self._current.append(string)
583
584 def pop(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000585 if not self._current:
586 return None
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000587 return self._current.pop()
588
589 def __len__(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000590 return sum(((1 if string is None else len(string))
591 for string in self._current),
Guido van Rossum9604e662007-08-30 03:46:43 +0000592 self._initial_size)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000593
594 def __str__(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000595 if self._current and self._current[-1] is None:
596 self._current.pop()
597 return EMPTYSTRING.join((' ' if string is None else string)
598 for string in self._current)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000599
600 def reset(self, string=None):
601 self._current = []
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000602 self._initial_size = 0
603 if string is not None:
604 self.push(string)
Guido van Rossum9604e662007-08-30 03:46:43 +0000605
606 def is_onlyws(self):
607 return len(self) == 0 or str(self).isspace()