blob: e1716176fa0890b6ff71211905c080f2e8a25c57 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
7__all__ = [
8 'Header',
9 'decode_header',
10 'make_header',
11 ]
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
21Charset = _charset.Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022
# String constants shared by the header folding/encoding code below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
# A tab is measured as eight columns when computing continuation width.
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when none is given.
MAXLINELEN = 78

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (\041) to
# tilde (\176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack.
_embeded_header = re.compile(r'\n[^ \t]+:')



# Helpers
_max_append = email.quoprimime._max_append
58
59
60
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (string, charset) pairs containing each of the decoded
    parts of the header.  Charset is None for non-encoded parts of the header,
    otherwise a lower-case string containing the name of the character set
    specified in the encoded string.

    If the header contains no encoded words at all, the single pair
    (header, None) is returned with the header value untouched; otherwise
    every returned string is a bytes object.

    An email.errors.HeaderParseError may be raised when certain decoding error
    occurs (e.g. a base64 decoding exception).
    """
    # If no encoding, just return the header with no charset.
    if not ecre.search(header):
        return [(header, None)]
    # First step is to parse all the encoded parts into triplets of the form
    # (encoded_string, encoding, charset).  For unencoded strings, the last
    # two parts will be None.
    words = []
    for line in header.splitlines():
        # ecre has three groups, so split() yields
        #   text, (charset, encoding, encoded, text)*
        # Walk it with a stride of four instead of repeatedly popping the
        # head of the list.
        parts = ecre.split(line)
        for index in range(0, len(parts), 4):
            unencoded = parts[index].strip()
            if unencoded:
                words.append((unencoded, None, None))
            if index + 1 < len(parts):
                charset, encoding, encoded = parts[index + 1:index + 4]
                words.append((encoded, encoding.lower(), charset.lower()))
    # The next step is to decode each encoded word by applying the reverse
    # base64 or quopri transformation.  decoded_words is now a list of the
    # form (decoded_word, charset).
    decoded_words = []
    for encoded_string, encoding, charset in words:
        if encoding is None:
            # This is an unencoded word.
            decoded_words.append((encoded_string, charset))
        elif encoding == 'q':
            word = email.quoprimime.header_decode(encoded_string)
            decoded_words.append((word, charset))
        elif encoding == 'b':
            # Postel's law: add missing padding
            paderr = len(encoded_string) % 4
            if paderr:
                encoded_string += '==='[:4 - paderr]
            try:
                word = email.base64mime.decode(encoded_string)
            except binascii.Error as exc:
                # Keep the low-level error reachable as __cause__.
                raise HeaderParseError('Base64 decoding error') from exc
            decoded_words.append((word, charset))
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
    # Now convert all words to bytes and collapse consecutive runs of
    # similarly encoded words.
    collapsed = []
    last_word = last_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if last_word is None:
            last_word = word
            last_charset = charset
        elif charset != last_charset:
            collapsed.append((last_word, last_charset))
            last_word = word
            last_charset = charset
        elif last_charset is None:
            # Adjacent unencoded words were separated by whitespace that the
            # strip() above removed; put a single space back.
            last_word += BSPACE + word
        else:
            last_word += word
    collapsed.append((last_word, last_charset))
    return collapsed
133
134
135
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): each pair is (decoded_string, charset), where charset is
    None, the string name of a character set, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  Returns the populated Header instance.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # A charset of None is passed straight through; Header.append() then
        # falls back to the header's default charset.  Anything else that is
        # not already a Charset is treated as a charset name.
        if not (cs is None or isinstance(cs, Charset)):
            cs = Charset(cs)
        header.append(text, cs)
    return header
156
157
158
class Header:
    """A header value that may combine text in several character sets.

    The value is kept as a list of (string, Charset) chunks.  str() returns
    the unencoded text of the header and encode() returns the encoded,
    line-wrapped form.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen. For
        splitting the first line to a shorter value (to account for the field
        header which isn't included in s, e.g. `Subject') pass in the name of
        the field in header_name.  The default maxlinelen is 78 as recommended
        by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Recover the original bytes from the surrogate escapes and
                # show the undecodable ones as replacement characters.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In either case, when producing an RFC 2822 compliant
        header using RFC 2047 rules, the string will be encoded using the
        output codec of the charset.  If the string cannot be encoded to the
        output codec, a UnicodeError will be raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            s = s.decode(input_charset, errors)
        # Ensure that the bytes we're storing can be decoded to the output
        # character set, otherwise an early error is thrown.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method will do its best to convert the string to the correct
        character set used in email, and encode and line wrap it safely with
        the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor for this call only.  A value of 0 means don't wrap.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce RFC-compliant
        line separators when needed.

        Raises HeaderParseError if the encoded value looks like it contains
        an embedded header (a newline followed by a field name and colon).
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            # ''.splitlines() is an empty list, so an empty chunk (e.g.
            # Header('')) would fail on lines[0].  Treat it as one empty
            # line instead.
            lines = string.splitlines() or ['']
            formatter.feed(lines[0], charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    # Encoded continuation lines get explicit leading
                    # whitespace fed as plain ASCII.
                    formatter.feed(self._continuation_ws, USASCII)
                    line = ' ' + line.lstrip()
                formatter.feed(line, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks
350
351
352
class _ValueFormatter:
    """Collect encoded header text into a list of wrapped lines.

    Text is supplied chunk by chunk through feed(); finished lines are kept in
    self._lines and the line being built is an _Accumulator in
    self._current_line.  _str() joins the finished lines.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen seeds the length of the first line, maxlen is the length
        that feed() compares lines against, continuation_ws is the text placed
        at the start of continuation lines, and splitchars is the ordered
        string of characters that unencoded text may be split on.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # Width of the continuation whitespace with each tab counted as
        # eight columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Finish the current line and start an empty one.

        The line is only added to the output when its length is non-zero.
        """
        # Popping and re-pushing discards a trailing None (the marker pushed
        # by add_transition()) while keeping any real text.
        end_of_line = self._current_line.pop()
        if end_of_line is not None:
            self._current_line.push(end_of_line)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Mark a boundary between chunks on the current line.

        The marker is a None entry, which _Accumulator counts as one character
        and renders as a single space unless it is the last entry.
        """
        self._current_line.push(None)

    def feed(self, string, charset):
        """Add string, encoded according to charset, to the output.

        charset must provide header_encode(), header_encode_lines() and a
        header_encoding attribute.
        """
        # If the string itself fits on the current line in its encoded format,
        # then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            # Pick the first split character, in priority order, that occurs
            # in the string; ch is None when none of them does.
            for ch in self._splitchars:
                if ch in string:
                    break
            else:
                ch = None
            # If there's no available split character then regardless of
            # whether the string fits on the line, we have to put it on a line
            # by itself.
            if ch is None:
                if not self._current_line.is_onlyws():
                    self._lines.append(str(self._current_line))
                    self._current_line.reset(self._continuation_ws)
                self._current_line.push(encoded_string)
            else:
                self._ascii_split(string, ch)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last encoded line stays in the current line so that later text
        # can still be appended to it.
        self._current_line.push(last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the first line, then on each later line.

        This generator never finishes: every value after the first is maxlen
        minus the width of the continuation whitespace.
        """
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        """Split unencoded string on ch and distribute it over lines.

        holding collects the pieces that are still waiting to be placed on
        the current line.
        """
        holding = _Accumulator()
        # Split the line on the split character, preserving it.  If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on.  However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character.  But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk.  Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done.  Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part.  In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break.  In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character.  See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit.  Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present.  First let's see if all three
                # parts will fit on the current line.  If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit.  See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace.  Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace.  The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))
556
557
558
559def _spliterator(character, string):
560 parts = list(reversed(re.split('(%s)' % character, string)))
561 while parts:
562 part = parts.pop()
563 splitparts = (parts.pop() if parts else None)
564 nextpart = (parts.pop() if parts else None)
565 yield (part, splitparts, nextpart)
566 if nextpart is not None:
567 parts.append(nextpart)
568
569
570class _Accumulator:
Guido van Rossum9604e662007-08-30 03:46:43 +0000571 def __init__(self, initial_size=0):
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000572 self._initial_size = initial_size
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000573 self._current = []
574
575 def push(self, string):
576 self._current.append(string)
577
578 def pop(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000579 if not self._current:
580 return None
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000581 return self._current.pop()
582
583 def __len__(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000584 return sum(((1 if string is None else len(string))
585 for string in self._current),
Guido van Rossum9604e662007-08-30 03:46:43 +0000586 self._initial_size)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000587
588 def __str__(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000589 if self._current and self._current[-1] is None:
590 self._current.pop()
591 return EMPTYSTRING.join((' ' if string is None else string)
592 for string in self._current)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000593
594 def reset(self, string=None):
595 self._current = []
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000596 self._initial_size = 0
597 if string is not None:
598 self.push(string)
Guido van Rossum9604e662007-08-30 03:46:43 +0000599
600 def is_onlyws(self):
601 return len(self) == 0 or str(self).isspace()