blob: 2e687b7a6f10742e25e9ea33d2ddb06a5e616c7d [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
# Public names exported by a star-import of this module.
__all__ = [
    'Header',
    'decode_header',
    'make_header',
    ]
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
21Charset = _charset.Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022
# Frequently used literal strings.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Maximum line length used by Header when the caller does not supply one.
MAXLINELEN = 78
# Characters treated as folding white space when splitting unencoded text.
FWS = ' \t'

# Shared Charset instances; USASCII is the default charset of a Header.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The pattern is compiled with re.MULTILINE, so the trailing "$" alternative
# also matches at the end of each line, not only at the end of the string.
ecre = re.compile(r'''
 =\? # literal =?
 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
 \? # literal ?
 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
 \? # literal ?
 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
 \?= # literal ?=
 (?=[ \t]|$) # whitespace or the end of the string
 ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack: a newline followed directly by non-whitespace
# characters and a colon looks like the start of a new header field.
_embeded_header = re.compile(r'\n[^ \t]+:')
54
Guido van Rossum8b3febe2007-08-30 01:15:14 +000055
56
# Helpers
# Module-level alias for the private _max_append helper of email.quoprimime.
_max_append = email.quoprimime._max_append
59
60
61
def decode_header(header):
    """Split a header value into decoded (string, charset) pairs.

    No charset conversion is performed: encoded words are only run through
    the reverse base64 / quoted-printable transformation.

    header may be a string, with or without RFC 2047 encoded words, or a
    Header object (anything exposing a ``_chunks`` attribute).

    The result is a list of (string, charset) pairs.  For parts that were
    not encoded the charset is None; for encoded words it is the lower-cased
    charset name taken from the encoded word.  When the string contains no
    encoded word at all it is returned unchanged as the only pair.

    Raises email.errors.HeaderParseError when a base64 encoded word cannot
    be decoded.
    """
    # A Header object already knows its chunks; just encode each of them.
    if hasattr(header, '_chunks'):
        return [(_charset._encode(text, str(cs)), str(cs))
                for text, cs in header._chunks]
    # Nothing encoded: hand the value back untouched, without a charset.
    if not ecre.search(header):
        return [(header, None)]

    # Pass 1: cut every line into (text, encoding, charset) triplets.  Plain
    # text gets None for the last two items.  ecre has three groups, so
    # splitting yields: plain, charset, encoding, encoded, plain, ...
    words = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                words.append((plain, None, None))
            if start + 1 < len(fields):
                cs_name, enc_name, payload = fields[start + 1:start + 4]
                words.append((payload, enc_name.lower(), cs_name.lower()))

    # Pass 2: undo the transfer encoding of each encoded word, producing
    # (decoded_word, charset) pairs.
    decoded_words = []
    for payload, encoding, charset in words:
        if encoding is None:
            # Plain text, nothing to undo.
            word = payload
        elif encoding == 'q':
            word = email.quoprimime.header_decode(payload)
        elif encoding == 'b':
            # Postel's law: add missing padding
            remainder = len(payload) % 4
            if remainder:
                payload += '==='[:4 - remainder]
            try:
                word = email.base64mime.decode(payload)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
        decoded_words.append((word, charset))

    # Pass 3: turn everything into bytes and merge neighbours that share a
    # charset.  Unencoded neighbours are joined with a space, encoded ones
    # are concatenated directly.
    collapsed = []
    run = run_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if run is None:
            run, run_charset = word, charset
        elif charset != run_charset:
            collapsed.append((run, run_charset))
            run, run_charset = word, charset
        elif run_charset is None:
            run += BSPACE + word
        else:
            run += word
    collapsed.append((run, run_charset))
    return collapsed
141
142
143
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header out of (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): each pair holds a decoded string and its charset,
    where the charset is None, a character set name, or a Charset instance.

    maxlinelen, header_name and continuation_ws are optional and are handed
    to the Header constructor unchanged.

    Returns the new Header instance with every pair appended in order.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # None is passed straight through to append(), which then falls back
        # to the header's default charset; names are wrapped in a Charset.
        if not (charset is None or isinstance(charset, Charset)):
            charset = Charset(charset)
        header.append(text, charset)
    return header
164
165
166
class Header:
    """A header value assembled from chunks of text, each with a charset."""

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can mix character sets.

        s is the optional initial value; when it is None the header starts
        out empty and is filled later through .append().  s may be a byte
        string or a Unicode string, see .append() for the semantics.

        charset plays two roles: it is used for s exactly like the charset
        argument of .append(), and it becomes the default for every later
        .append() call that omits its own charset.  When it is not given,
        us-ascii is used for both.

        maxlinelen sets the maximum line length explicitly; it defaults to
        78 as recommended by RFC 2822.  Pass the field name (e.g.
        `Subject') as header_name so the first line is shortened to leave
        room for the field header, which is not part of s.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually a space or a hard tab); it is prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        self._maxlinelen = MAXLINELEN if maxlinelen is None else maxlinelen
        # The field name is followed by a colon and a space, hence the + 2.
        self._headerlen = 0 if header_name is None else len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        pieces = []
        prev_plain = None
        for text, cs in self._chunks:
            if cs == _charset.UNKNOWN8BIT:
                # Recover the original bytes and show undecodable ones with
                # the replacement character.
                raw = text.encode('ascii', 'surrogateescape')
                text = raw.decode('ascii', 'replace')
            # Spaces at the boundary between encoded and non-encoded words
            # must be preserved: insert one whenever we move from a real
            # charset to None/us-ascii or the other way round.  This only
            # applies to the second and subsequent chunks.
            is_plain = cs in (None, 'us-ascii')
            if pieces and is_plain != prev_plain:
                pieces.append(SPACE)
            prev_plain = is_plain
            pieces.append(text)
        return EMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string; both work.  We turn ourselves
        # into the unencoded string value and let other do the comparison
        # with swapped arguments.
        return other == str(self)

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        charset, when given, is a Charset instance or a character set name
        (which is converted to a Charset instance).  None, the default,
        selects the charset given to the constructor.

        s may be a byte string or a Unicode string.  For a byte string
        (i.e. isinstance(s, str) is false) charset names the encoding of
        the bytes, and a UnicodeError is raised when they cannot be decoded
        with it.  For a Unicode string charset is a hint about the
        character set of its characters.  Either way the string must be
        encodable with the output codec of the charset, which is what is
        used when producing an RFC 2822 compliant header by RFC 2047 rules;
        otherwise a UnicodeError is raised.

        errors is passed as the errors argument to the decode call when s
        is a byte string, and to the encodability check.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            source_codec = charset.input_codec or 'us-ascii'
            if source_codec == _charset.UNKNOWN8BIT:
                s = s.decode('us-ascii', 'surrogateescape')
            else:
                s = s.decode(source_codec, errors)
        # Fail early: the text we store must be encodable with the output
        # character set.  The encoded result itself is discarded.
        target_codec = charset.output_codec or 'us-ascii'
        if target_codec != _charset.UNKNOWN8BIT:
            s.encode(target_codec, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode the header value into an RFC-compliant format.

        Converting a string for use in an email header is delicate: only
        some character sets are readable in most email clients, header
        strings may only contain a subset of 7-bit ASCII so other text has
        to be encoded with Base64 or quoted-printable, and there is a
        75-character limit on any encoded header field, so lines must be
        wrapped, even with double-byte character sets.

        maxlinelen is the maximum length of each generated line, not
        counting the linesep string.  A line may still exceed it when no
        folding point can be found.  The first line is shorter by the
        length of the header name plus ": " when a header name was given
        at construction time.  The default is the value determined at
        construction time; 0 means do not wrap.

        splitchars lists characters that get extra weight when wrapping
        unencoded text, in rough support of RFC 2822's `higher level
        syntactic breaks': split points preceded by a splitchar are
        preferred, with preference in the order the characters appear in
        the string.  Space and tab may be included to say which of them is
        preferred as a split point when no other split char occurs in the
        line.  Splitchars does not affect RFC 2047 encoded lines.

        linesep is the string placed between the lines of the value.  The
        default suits typical Python applications; use \r\n to produce
        RFC-compliant line separators.

        Raises HeaderParseError when the encoded value appears to contain
        an embedded header.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes a
        # huge limit does the same and keeps _ValueFormatter simple.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for text, cs in self._chunks:
            rows = text.splitlines()
            formatter.feed('', rows[0] if rows else '', cs)
            for row in rows[1:]:
                formatter.newline()
                stripped = row.lstrip()
                if cs.header_encoding is None:
                    # Unencoded text keeps its own leading whitespace.
                    leading = row[:len(row) - len(stripped)]
                    formatter.feed(leading, stripped, cs)
                else:
                    formatter.feed(self._continuation_ws, ' ' + stripped, cs)
            if len(rows) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Collapse every run of consecutive chunks that share a charset into
        # one chunk whose strings are joined with single spaces.
        merged = []
        run_charset = None
        run = []
        for text, cs in self._chunks:
            if cs == run_charset:
                run.append(text)
                continue
            if run_charset is not None:
                merged.append((SPACE.join(run), run_charset))
            run = [text]
            run_charset = cs
        if run:
            merged.append((SPACE.join(run), run_charset))
        self._chunks = merged
373
374
375
class _ValueFormatter:
    """Collect pieces of a header value and fold them into lines.

    Finished lines are kept in ``_lines``; the line currently being built
    is an _Accumulator of (folding whitespace, text) pairs whose initial
    size accounts for the header name on the first line.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen is the space already taken on the first line, maxlen the
        maximum line length, continuation_ws the whitespace that starts
        continuation lines of encoded text, and splitchars the preferred
        split characters, in order of preference.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        self._continuation_ws_len = len(continuation_ws)
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Finish the current line and start a new, empty one."""
        # Drop a trailing transition marker (see add_transition); anything
        # else that was popped goes back on the line.
        end_of_line = self._current_line.pop()
        if end_of_line != (' ', ''):
            self._current_line.push(*end_of_line)
        if len(self._current_line) > 0:
            # A whitespace-only line is glued onto the previous line instead
            # of being emitted on its own.  That is only possible when a
            # previous line exists; without this check a value consisting
            # solely of whitespace raised IndexError on the empty list.
            if self._current_line.is_onlyws() and self._lines:
                self._lines[-1] += str(self._current_line)
            else:
                self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Mark the boundary between two chunks with a lone space."""
        self._current_line.push(' ', '')

    def feed(self, fws, string, charset):
        """Add string, preceded by the whitespace fws, to the output.

        charset decides how the text is handled: without a header encoding
        it is split as plain ASCII text, otherwise it is encoded line by
        line through charset.header_encode_lines().
        """
        # If the charset has no header encoding (i.e. it is an ASCII
        # encoding) then we must split the header at the "highest level
        # syntactic break" possible.  Note that we don't have a lot of
        # smarts about field syntax; we just try to break on semi-colons,
        # then commas, then whitespace.  Eventually, this should be
        # pluggable.
        if charset.header_encoding is None:
            self._ascii_split(fws, string, self._splitchars)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on
        # the current line, minus the RFC 2047 chrome.  What makes this
        # trickier though is that we have to split at octet boundaries, not
        # character boundaries but it's only safe to split at character
        # boundaries so at best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._append_chunk(fws, first_line)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last encoded line becomes the new current line so that later
        # text can still be appended to it.
        self.newline()
        self._current_line.push(self._continuation_ws, last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the current line, then on each new line."""
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, fws, string, splitchars):
        """Feed unencoded text to the current line, one word at a time."""
        # The RFC 2822 header folding algorithm is simple in principle but
        # complex in practice.  Lines may be folded any place where "folding
        # white space" appears by inserting a linesep character in front of
        # the FWS.  The complication is that not all spaces or tabs qualify
        # as FWS, and we are also supposed to prefer to break at "higher
        # level syntactic breaks".  We can't do either of these without
        # intimate knowledge of the structure of structured headers, which
        # we don't have here.  So the best we can do here is prefer to break
        # at the specified splitchars, and hope that we don't choose any
        # spaces or tabs that aren't legal FWS.  (This is at least better
        # than the old algorithm, where we would sometimes *introduce* FWS
        # after a splitchar, or the algorithm before that, where we would
        # turn all white space runs into single spaces or tabs.)
        parts = re.split("(["+FWS+"]+)", fws+string)
        # Make the list start with a whitespace item so that it consists of
        # (whitespace, text) pairs: prepend an empty one when the text does
        # not start with whitespace, else drop the empty leading text item.
        if parts[0]:
            parts[:0] = ['']
        else:
            parts.pop(0)
        for fws, part in zip(*[iter(parts)]*2):
            self._append_chunk(fws, part)

    def _append_chunk(self, fws, string):
        """Push one (fws, string) pair and fold the line if it got too long."""
        self._current_line.push(fws, string)
        if len(self._current_line) > self._maxlen:
            # Find the best split point, working backward from the end.
            # There might be none, on a long first line.
            for ch in self._splitchars:
                for i in range(self._current_line.part_count()-1, 0, -1):
                    if ch.isspace():
                        # A whitespace split char matches the whitespace in
                        # front of part i.
                        fws = self._current_line[i][0]
                        if fws and fws[0]==ch:
                            break
                    # Any split char matches when the preceding part ends
                    # with it.
                    prevpart = self._current_line[i-1][1]
                    if prevpart and prevpart[-1]==ch:
                        break
                else:
                    # No split point for this character, try the next one.
                    continue
                break
            else:
                # No split point at all: the chunk stays on an over-long
                # line.
                fws, part = self._current_line.pop()
                if self._current_line._initial_size > 0:
                    # There will be a header, so leave it on a line by itself.
                    self.newline()
                    if not fws:
                        # We don't use continuation_ws here because the
                        # whitespace after a header should always be a space.
                        fws = ' '
                self._current_line.push(fws, part)
                return
            # Everything from the split point on moves to the next line.
            remainder = self._current_line.pop_from(i)
            self._lines.append(str(self._current_line))
            self._current_line.reset(remainder)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000502
503
class _Accumulator(list):
    """A list of (fws, string) pairs that measures itself in characters.

    len() reports the number of characters held plus an initial size, not
    the number of pairs; use part_count() for the latter.
    """

    def __init__(self, initial_size=0):
        self._initial_size = initial_size
        super().__init__()

    def push(self, fws, string):
        """Add one (fws, string) pair at the end."""
        self.append((fws, string))

    def pop_from(self, i=0):
        """Remove and return, as a list, all pairs from index i onward."""
        tail = self[i:]
        del self[i:]
        return tail

    def pop(self):
        """Remove and return the last pair, or ('', '') when there is none."""
        if not self.part_count():
            return ('', '')
        return super().pop()

    def __len__(self):
        # Character count of all pairs on top of the initial size.
        total = self._initial_size
        for fws, part in self:
            total += len(fws) + len(part)
        return total

    def __str__(self):
        return EMPTYSTRING.join(fws + part for fws, part in self)

    def reset(self, startval=None):
        """Replace the content with startval (default: nothing).

        The initial size is set back to 0 as well.
        """
        self[:] = [] if startval is None else startval
        self._initial_size = 0

    def is_onlyws(self):
        """Tell whether the line is empty or whitespace with no initial size."""
        if self._initial_size != 0:
            return False
        return not self or str(self).isspace()

    def part_count(self):
        """Return the number of pairs held."""
        return super().__len__()