blob: f90883fe9545b35efea0ab1e85530f5eb7d9e4f1 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
# Public names exported by ``from <this module> import *``.
__all__ = [
    'Header',
    'decode_header',
    'make_header',
    ]
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
21Charset = _charset.Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022
# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
# Stand-in used when measuring the width of a tab in continuation whitespace.
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when none is given.
MAXLINELEN = 78

# Pre-built Charset instances; USASCII is Header's default charset.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')
32
# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The pattern has three groups (charset, encoding, encoded), so
# ecre.split() yields plain text followed by repeating runs of
# charset, encoding, encoded, plain text.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
44
# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')
49
50
51
# Helpers
# Module-level alias for the quoprimime helper of the same name.
_max_append = email.quoprimime._max_append
54
55
56
def decode_header(header):
    """Break a header value into its decoded (value, charset) parts.

    The charsets named in the header are reported, not applied: no
    conversion to unicode takes place.

    If the header contains no RFC 2047 encoded word, the result is the
    single pair ``(header, None)`` with the header returned unchanged.
    Otherwise each pair holds a bytes value together with either None (for
    plain, unencoded text) or the lower-cased charset name taken from the
    encoded word.  Neighbouring parts that share a charset are merged;
    plain-text parts are merged with a single space between them.

    Raises email.errors.HeaderParseError when a base64 encoded word cannot
    be decoded.
    """
    # Fast path: nothing in the header looks like an encoded word.
    if ecre.search(header) is None:
        return [(header, None)]

    # Pass 1: tokenize each line into (text, encoding, charset) triples.
    # Plain text carries None for both encoding and charset.  Because ecre
    # has three groups, split() returns plain text at every fourth index
    # with the charset, encoding and payload of an encoded word in between.
    tokens = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            literal = fields[start].strip()
            if literal:
                tokens.append((literal, None, None))
            group = fields[start + 1:start + 4]
            if group:
                charset, encoding, payload = group
                tokens.append((payload, encoding.lower(), charset.lower()))

    # Pass 2: undo the quoted-printable or base64 transformation, giving
    # (decoded_word, charset) pairs.
    decoded = []
    for payload, encoding, charset in tokens:
        if encoding is None:
            # Plain text needs no decoding.
            decoded.append((payload, charset))
        elif encoding == 'q':
            decoded.append((email.quoprimime.header_decode(payload), charset))
        elif encoding == 'b':
            # Postel's law: supply any padding the sender left off.
            missing = len(payload) % 4
            if missing:
                payload += '==='[:4 - missing]
            try:
                raw = email.base64mime.decode(payload)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
            decoded.append((raw, charset))
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)

    # Pass 3: turn every word into bytes and merge runs that share a
    # charset.
    merged = []
    run_bytes = run_charset = None
    for word, charset in decoded:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if run_bytes is None:
            # Very first word: it starts the first run.
            run_bytes, run_charset = word, charset
        elif charset != run_charset:
            # Charset changed: close the current run and start another.
            merged.append((run_bytes, run_charset))
            run_bytes, run_charset = word, charset
        elif run_charset is None:
            # Plain-text words are separated by a space.
            run_bytes += BSPACE + word
        else:
            # Encoded words of one charset are joined directly.
            run_bytes += word
    merged.append((run_bytes, run_charset))
    return merged
129
130
131
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (value, charset) pairs.

    decoded_seq is a sequence of pairs in the shape produced by
    decode_header(): a decoded value and the charset that applies to it.
    The charset may be None, a character set name, or a Charset instance.

    maxlinelen, header_name and continuation_ws are handed unchanged to the
    Header constructor.  Returns the populated Header instance.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for value, cs in decoded_seq:
        # None is passed straight through to append(); anything else that
        # is not already a Charset is treated as a charset name.
        if not (cs is None or isinstance(cs, Charset)):
            cs = Charset(cs)
        result.append(value, cs)
    return result
152
153
154
class Header:
    """A header value built from chunks that may use different charsets.

    The value is kept as a list of (string, Charset) pairs in
    ``self._chunks``.  str() returns the unencoded unicode value and
    encode() returns the encoded, line-wrapped form.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 78 as
        recommended by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Round-trip through bytes so that surrogate-escaped bytes
                # are rendered with the codec's replacement character.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In either case, when producing an RFC 2822 compliant
        header using RFC 2047 rules, the string will be encoded using the
        output codec of the charset.  If the string cannot be encoded to the
        output codec, a UnicodeError will be raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            s = s.decode(input_charset, errors)
        # Ensure that the bytes we're storing can be decoded to the output
        # character set, otherwise an early error is thrown.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor for this call only; 0 means do not wrap.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \\r\\n to produce
        RFC-compliant line separators when needed.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            # ''.splitlines() is an empty list, so an empty chunk (for
            # example Header('')) is treated as one empty line; indexing
            # lines[0] directly would raise IndexError.
            lines = string.splitlines() or ['']
            formatter.feed(lines[0], charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    # For encoded charsets the continuation whitespace is
                    # fed as plain ASCII and the line's own leading
                    # whitespace is reduced to one space.
                    formatter.feed(self._continuation_ws, USASCII)
                    line = ' ' + line.lstrip()
                formatter.feed(line, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        return formatter._str(linesep)

    def _normalize(self):
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string (joined with a space).
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks
342
343
344
class _ValueFormatter:
    """Collect encoded header text into a list of wrapped lines.

    Finished lines are stored in ``self._lines``; the line currently being
    built is ``self._current_line``, an _Accumulator.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        # maxlen is the limit every length comparison below is made against.
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # Width of the continuation whitespace with each tab counted as
        # eight columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        self._lines = []
        # headerlen seeds the accumulator's size so that it counts toward
        # the length of the first line only (reset() clears it).
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        # Flush the line in progress, then join all lines with linesep.
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        # Finish the current line: append it to the output if it is
        # non-empty, then start a fresh one.  A trailing None (a transition
        # marker) popped here is deliberately not pushed back.
        end_of_line = self._current_line.pop()
        if end_of_line is not None:
            self._current_line.push(end_of_line)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        # Mark a boundary between chunks by pushing None onto the line.
        self._current_line.push(None)

    def feed(self, string, charset):
        # Add string, encoded according to charset, to the output, wrapping
        # onto new lines as needed.
        #
        # If the string itself fits on the current line in its encoded format,
        # then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            # Pick the first split character (in splitchars order) that
            # occurs in the string; the for/else leaves ch as None if none
            # does.
            for ch in self._splitchars:
                if ch in string:
                    break
            else:
                ch = None
            # If there's no available split character then regardless of
            # whether the string fits on the line, we have to put it on a line
            # by itself.
            if ch is None:
                if not self._current_line.is_onlyws():
                    self._lines.append(str(self._current_line))
                    self._current_line.reset(self._continuation_ws)
                self._current_line.push(encoded_string)
            else:
                self._ascii_split(string, ch)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last encoded line becomes the new line in progress.
        self._current_line.push(last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        # Generator of the room available on each successive line.
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        # Every later line starts with the continuation whitespace.
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        # Fold an unencoded string across lines, breaking at occurrences of
        # the split character ch.  `holding` collects parts that have been
        # accepted but not yet moved onto the current line.
        holding = _Accumulator()
        # Split the line on the split character, preserving it.  If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on.  However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character.  But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk.  Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done.  Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part.  In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break.  In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character.  See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit.  Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present.  First let's see if all three
                # parts will fit on the current line.  If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit.  See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace.  Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace.  The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))
548
549
550
def _spliterator(character, string):
    """Yield overlapping (part, splitpart, nextpart) triples from string.

    string is split on every occurrence of character, keeping the
    separators.  Each triple holds a piece of text, the separator that
    follows it and the piece after that separator; that following piece is
    then yielded again as the ``part`` of the next triple.  The final
    triple has None for both splitpart and nextpart.

    character is matched literally: it is escaped before being placed in
    the regular expression, so characters such as '.', '|' or '(' split
    only on themselves instead of being read as regex syntax.
    """
    pattern = '(%s)' % re.escape(character)
    # Reversed so that pop() hands the pieces back in their original order.
    parts = list(reversed(re.split(pattern, string)))
    while parts:
        part = parts.pop()
        splitparts = (parts.pop() if parts else None)
        nextpart = (parts.pop() if parts else None)
        yield (part, splitparts, nextpart)
        if nextpart is not None:
            # Put it back so it leads the next triple.
            parts.append(nextpart)
560
561
class _Accumulator:
    """A stack of string pieces that together form one output line.

    A piece may be None; it then counts as one character of length and is
    rendered as a single space.
    """

    def __init__(self, initial_size=0):
        # initial_size is added to the reported length until reset().
        self._current = []
        self._initial_size = initial_size

    def push(self, string):
        """Add a piece (a string or None) to the end."""
        self._current.append(string)

    def pop(self):
        """Remove and return the last piece, or None if there is none."""
        return self._current.pop() if self._current else None

    def __len__(self):
        # Initial size plus the length of each piece; None counts as 1.
        total = self._initial_size
        for piece in self._current:
            total += 1 if piece is None else len(piece)
        return total

    def __str__(self):
        # A trailing None is dropped for good (this mutates the
        # accumulator); any other None is rendered as a space.
        pieces = self._current
        if pieces and pieces[-1] is None:
            del pieces[-1]
        return EMPTYSTRING.join(
            [' ' if piece is None else piece for piece in pieces])

    def reset(self, string=None):
        """Discard all pieces and the initial size; optionally start with string."""
        self._initial_size = 0
        self._current = [] if string is None else [string]

    def is_onlyws(self):
        """Return True if the length is zero or the text is all whitespace."""
        if len(self) == 0:
            return True
        return str(self).isspace()