blob: 0a66df54ae63c8ff7da93c6a1845fae085da5197 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
7__all__ = [
8 'Header',
9 'decode_header',
10 'make_header',
11 ]
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
21Charset = _charset.Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022
# Small string constants shared by the functions and classes in this module.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when none is given.
MAXLINELEN = 78
# Characters treated as folding white space when splitting header values.
FWS = ' \t'

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176), i.e. the printable ASCII characters other than
# space.  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack: the pattern matches a newline that is immediately
# followed by non-whitespace text and a colon.
_embeded_header = re.compile(r'\n[^ \t]+:')



# Helpers
_max_append = email.quoprimime._max_append
59
60
61
def decode_header(header):
    """Decode a message header value without converting charset.

    header may be a string that may or may not contain RFC 2047 encoded
    words, or it may be an object carrying a _chunks attribute (such as a
    Header instance).

    Returns a list of (value, charset) pairs:

    * for an object with _chunks, a copy of that chunk list;
    * for a string without any encoded word, [(header, None)] with the
      string left untouched;
    * otherwise one pair per run of parts sharing a charset, where value is
      a bytes object and charset is None for unencoded text or the
      lower-cased charset name taken from the encoded word.

    An email.errors.HeaderParseError is raised when the base64 payload of an
    encoded word cannot be decoded.
    """
    # Objects that already hold decoded chunks need no parsing at all.
    if hasattr(header, '_chunks'):
        return list(header._chunks)
    # A value without encoded words is handed back unchanged.
    if not ecre.search(header):
        return [(header, None)]

    # Pass 1: cut every line into (text, encoding, charset) triplets.  The
    # regexp has three groups, so split() yields a list shaped like
    # [plain, charset, encoding, payload, plain, charset, ...]; walk it in
    # strides of four.  Plain text gets None for encoding and charset.
    words = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                words.append((plain, None, None))
            group = fields[start + 1:start + 4]
            if group:
                charset, encoding, payload = group
                words.append((payload, encoding.lower(), charset.lower()))

    # Pass 2: undo the quoted-printable or base64 transformation, producing
    # (decoded_word, charset) pairs.
    decoded_words = []
    for payload, encoding, charset in words:
        if encoding is None:
            # Plain text is carried through as is.
            decoded = payload
        elif encoding == 'q':
            decoded = email.quoprimime.header_decode(payload)
        elif encoding == 'b':
            # Postel's law: add missing padding
            excess = len(payload) % 4
            if excess:
                payload += '==='[:4 - excess]
            try:
                decoded = email.base64mime.decode(payload)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
        decoded_words.append((decoded, charset))

    # Pass 3: turn every word into bytes and merge neighbours that share a
    # charset.  Unencoded neighbours are joined with a space; encoded ones
    # are concatenated directly.
    collapsed = []
    run = run_charset = None
    for word, charset in decoded_words:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if run is None:
            run, run_charset = word, charset
        elif charset != run_charset:
            collapsed.append((run, run_charset))
            run, run_charset = word, charset
        elif run_charset is None:
            run += BSPACE + word
        else:
            run += word
    collapsed.append((run, run_charset))
    return collapsed
140
141
142
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header(): each pair holds a value and either None, the name of a
    character set, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.

    Returns the Header instance with every pair appended in order.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # None is passed straight through so that Header.append() applies
        # its own default; anything else that is not yet a Charset gets
        # wrapped in one.
        already_usable = charset is None or isinstance(charset, Charset)
        header.append(text, charset if already_usable else Charset(charset))
    return header
163
164
165
class Header:
    """A header value stored as a list of (string, Charset) chunks.

    Chunks are added with append(); str() gives the unencoded value and
    encode() gives the folded, RFC 2047 encoded form.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 78 as
        recommended by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Bytes of unknown encoding are stored surrogate-escaped;
                # show each undecodable byte as a replacement character.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the unencoded string value of this header with other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  The one exception is the
        unknown-8bit charset: it names no real codec, so such bytes are
        decoded as ASCII with the surrogateescape error handler, which is
        the representation __str__() expects for that charset.

        If s is a Unicode string, then charset is a hint specifying the
        character set of the characters in the string.  In either case, when
        producing an RFC 2822 compliant header using RFC 2047 rules, the
        string will be encoded using the output codec of the charset.  If the
        string cannot be encoded to the output codec, a UnicodeError will be
        raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            if input_charset == _charset.UNKNOWN8BIT:
                # 'unknown-8bit' is not a codec; decoding with it would
                # raise LookupError.  Keep the raw bytes recoverable instead.
                s = s.decode('us-ascii', 'surrogateescape')
            else:
                s = s.decode(input_charset, errors)
        # Ensure that the bytes we're storing can be decoded to the output
        # character set, otherwise an early error is thrown.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Optional maxlinelen specifies the maximum length of each generated
        line, exclusive of the linesep string.  Individual lines may be longer
        than maxlinelen if a folding point cannot be found.  The first line
        will be shorter by the length of the header name plus ": " if a header
        name was specified at Header construction time.  The default value for
        maxlinelen is determined at header construction time.

        Optional splitchars is a string containing characters which should be
        given extra weight by the splitting algorithm during normal header
        wrapping.  This is in very rough support of RFC 2822's `higher level
        syntactic breaks':  split points preceded by a splitchar are preferred
        during line splitting, with the characters preferred in the order in
        which they appear in the string.  Space and tab may be included in the
        string to indicate whether preference should be given to one over the
        other as a split point when other split chars do not appear in the line
        being split.  Splitchars does not affect RFC 2047 encoded lines.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce RFC-compliant
        line separators when needed.

        Raises HeaderParseError if the encoded value contains a newline
        followed by what looks like another header field.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            # Always feed something, even for an empty chunk.
            if lines:
                formatter.feed('', lines[0], charset)
            else:
                formatter.feed('', '', charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    formatter.feed(self._continuation_ws, ' ' + line.lstrip(),
                                   charset)
                else:
                    # Keep the line's own leading whitespace as its folding
                    # white space.
                    sline = line.lstrip()
                    fws = line[:len(line)-len(sline)]
                    formatter.feed(fws, sline, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        """Merge adjacent chunks that share a charset, joined by a space."""
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks
369
370
371
class _ValueFormatter:
    """Accumulate header text into folded output lines.

    Finished lines are collected in self._lines; the line being built lives
    in self._current_line, an _Accumulator of (fws, string) parts.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen is the initial size given to the first line's accumulator,
        maxlen the line length that triggers folding, continuation_ws the
        whitespace prefixed to continuation lines of encoded text, and
        splitchars the preferred split characters in priority order.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        self._continuation_ws_len = len(continuation_ws)
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        """Return the formatted value with lines joined by a newline."""
        return self._str(NL)

    def newline(self):
        """Move the current line into the finished lines and start afresh."""
        # Drop a trailing transition marker (see add_transition()); anything
        # else that was popped goes back onto the line.
        end_of_line = self._current_line.pop()
        if end_of_line != (' ', ''):
            self._current_line.push(*end_of_line)
        if len(self._current_line) > 0:
            # A whitespace-only line is glued onto the previous line rather
            # than emitted on its own.  That is only possible when a
            # previous line exists; without the check a whitespace-only
            # value would raise IndexError on self._lines[-1].
            if self._current_line.is_onlyws() and self._lines:
                self._lines[-1] += str(self._current_line)
            else:
                self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Push a (' ', '') marker separating one chunk from the next."""
        self._current_line.push(' ', '')

    def feed(self, fws, string, charset):
        """Add string, preceded by the whitespace fws, to the output.

        Text whose charset has no header encoding is split on folding white
        space; otherwise it is encoded into lines by the charset.
        """
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            self._ascii_split(fws, string, self._splitchars)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._append_chunk(fws, first_line)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        self.newline()
        # The last line becomes the new current line so that later text can
        # still extend it.
        self._current_line.push(self._continuation_ws, last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the current line, then on each later line."""
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, fws, string, splitchars):
        """Split fws+string on runs of white space and append each piece.

        splitchars is accepted but not read here; _append_chunk() consults
        self._splitchars.
        """
        # The RFC 2822 header folding algorithm is simple in principle but
        # complex in practice.  Lines may be folded any place where "folding
        # white space" appears by inserting a linesep character in front of the
        # FWS.  The complication is that not all spaces or tabs qualify as FWS,
        # and we are also supposed to prefer to break at "higher level
        # syntactic breaks".  We can't do either of these without intimate
        # knowledge of the structure of structured headers, which we don't have
        # here.  So the best we can do here is prefer to break at the specified
        # splitchars, and hope that we don't choose any spaces or tabs that
        # aren't legal FWS.  (This is at least better than the old algorithm,
        # where we would sometimes *introduce* FWS after a splitchar, or the
        # algorithm before that, where we would turn all white space runs into
        # single spaces or tabs.)
        parts = re.split("(["+FWS+"]+)", fws+string)
        # Arrange the list as alternating (whitespace, text) items: add an
        # empty leading whitespace when the value starts with text, or drop
        # the empty leading text when it starts with whitespace.
        if parts[0]:
            parts[:0] = ['']
        else:
            parts.pop(0)
        # Consume the flat list two items at a time.
        for fws, part in zip(*[iter(parts)]*2):
            self._append_chunk(fws, part)

    def _append_chunk(self, fws, string):
        """Push (fws, string) onto the current line, folding it if too long."""
        self._current_line.push(fws, string)
        if len(self._current_line) > self._maxlen:
            # Find the best split point, working backward from the end.
            # There might be none, on a long first line.
            for ch in self._splitchars:
                for i in range(self._current_line.part_count()-1, 0, -1):
                    if ch.isspace():
                        fws = self._current_line[i][0]
                        if fws and fws[0]==ch:
                            break
                    prevpart = self._current_line[i-1][1]
                    if prevpart and prevpart[-1]==ch:
                        break
                else:
                    # No split point for this character; try the next one.
                    continue
                break
            else:
                # No split character matched anywhere on the line.
                fws, part = self._current_line.pop()
                if self._current_line._initial_size > 0:
                    # There will be a header, so leave it on a line by itself.
                    self.newline()
                    if not fws:
                        # We don't use continuation_ws here because the whitespace
                        # after a header should always be a space.
                        fws = ' '
                self._current_line.push(fws, part)
                return
            # Everything from the split point onward starts the next line.
            remainder = self._current_line.pop_from(i)
            self._lines.append(str(self._current_line))
            self._current_line.reset(remainder)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000498
499
class _Accumulator(list):
    """A list of (fws, string) parts making up one output line.

    len() reports the number of characters held plus an initial size, not
    the number of parts; use part_count() for the latter.
    """

    def __init__(self, initial_size=0):
        """Start empty, counting initial_size extra characters in len()."""
        super().__init__()
        self._initial_size = initial_size

    def push(self, fws, string):
        """Add one (fws, string) part at the end."""
        self.append((fws, string))

    def pop_from(self, i=0):
        """Remove the parts from index i onward and return them as a list."""
        tail = self[i:]
        del self[i:]
        return tail

    def pop(self):
        """Remove and return the last part, or ('', '') if there is none."""
        if self.part_count():
            return super().pop()
        return ('', '')

    def __len__(self):
        """Return the initial size plus the length of every fws and string."""
        total = self._initial_size
        for fws, part in self:
            total += len(fws) + len(part)
        return total

    def __str__(self):
        """Return all parts concatenated in order."""
        return EMPTYSTRING.join(fws + part for fws, part in self)

    def reset(self, startval=None):
        """Replace the contents with startval (default: nothing).

        The initial size is cleared as well.
        """
        self[:] = [] if startval is None else startval
        self._initial_size = 0

    def is_onlyws(self):
        """Tell whether the line has no initial size and no visible text."""
        if self._initial_size != 0:
            return False
        return not self or str(self).isspace()

    def part_count(self):
        """Return the number of (fws, string) parts held."""
        return list.__len__(self)