blob: 06708853c2d90de91981c5300b658ceabeb21db6 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
# Names exported by ``from email.header import *``.
__all__ = ['Header', 'decode_header', 'make_header']
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
Charset = _charset.Charset

# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
SPACE8 = SPACE * 8
EMPTYSTRING = ''
# Default maximum length of a generated header line.
MAXLINELEN = 78
# The characters treated as folding white space.
FWS = ' \t'

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  (?=[ \t]|$)           # whitespace or the end of the string
  ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  The character range runs from exclamation mark
# (octal 041) to tilde (octal 176).  For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack.
_embeded_header = re.compile(r'\n[^ \t]+:')


# Helpers
_max_append = email.quoprimime._max_append
59
60
61
def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (string, charset) pairs, one per decoded part of
    the header.  For parts that were not encoded the charset is None;
    otherwise it is the lower-cased character set name taken from the
    encoded word.

    header may be a string, with or without RFC 2047 encoded words, or a
    Header object.

    An email.errors.HeaderParseError is raised when a base64 encoded word
    cannot be decoded.
    """
    # A Header object already holds (string, charset) chunks; just encode
    # each chunk with its own charset.
    if hasattr(header, '_chunks'):
        pairs = []
        for text, cs in header._chunks:
            cs_name = str(cs)
            pairs.append((_charset._encode(text, cs_name), cs_name))
        return pairs
    # Without any encoded word the header is returned untouched.
    if ecre.search(header) is None:
        return [(header, None)]
    # Pass 1: break every line into (payload, encoding, charset) triples.
    # ecre has three groups, so split() yields
    #   [text, charset, encoding, payload, text, charset, ...]
    # i.e. plain text sits at every fourth index.  Plain text gets None for
    # both encoding and charset.
    triples = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                triples.append((plain, None, None))
            if start + 1 < len(fields):
                cs_name = fields[start + 1].lower()
                enc = fields[start + 2].lower()
                payload = fields[start + 3]
                triples.append((payload, enc, cs_name))
    # Pass 2: undo the quoted-printable / base64 transformation, producing
    # (decoded_word, charset) pairs.
    decoded = []
    for payload, enc, cs_name in triples:
        if enc is None:
            # Plain, unencoded text.
            decoded.append((payload, cs_name))
            continue
        if enc == 'q':
            raw = email.quoprimime.header_decode(payload)
        elif enc == 'b':
            # Postel's law: add missing padding
            missing = len(payload) % 4
            if missing:
                payload += '==='[:4 - missing]
            try:
                raw = email.base64mime.decode(payload)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + enc)
        decoded.append((raw, cs_name))
    # Pass 3: turn every word into bytes and merge neighbouring words that
    # share a charset.  Unencoded neighbours are joined with a space,
    # encoded neighbours are concatenated directly.
    merged = []
    run_bytes = run_charset = None
    for piece, cs_name in decoded:
        if isinstance(piece, str):
            piece = bytes(piece, 'raw-unicode-escape')
        if run_bytes is None:
            run_bytes, run_charset = piece, cs_name
        elif cs_name != run_charset:
            merged.append((run_bytes, run_charset))
            run_bytes, run_charset = piece, cs_name
        elif run_charset is None:
            run_bytes += BSPACE + piece
        else:
            run_bytes += piece
    merged.append((run_bytes, run_charset))
    return merged
141
142
143
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from pairs such as those returned by decode_header().

    decoded_seq is an iterable of (decoded_string, charset) pairs, where
    charset is either None, the name of a character set, or a Charset
    instance.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.  The populated Header instance is returned.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # None is passed straight through to Header.append(); anything else
        # that is not already a Charset is wrapped in one.
        if not (cs is None or isinstance(cs, Charset)):
            cs = Charset(cs)
        result.append(text, cs)
    return result
164
165
166
class Header:
    """A header value built from chunks that may use different charsets."""

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  When it is None no initial
        value is set; more text can be added later with .append().  s may be
        a byte string or a Unicode string, but see the .append()
        documentation for semantics.

        Optional charset has the same meaning as the charset argument of
        .append(), and it also becomes the default character set for every
        later .append() call that omits charset.  When charset is not given,
        us-ascii is used for both purposes.

        The maximum line length can be specified explicitly via maxlinelen;
        when omitted, MAXLINELEN is used.  To have the first line split
        shorter (to leave room for the field name, which is not part of s,
        e.g. `Subject') pass the field name in header_name.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually either a space or a hard tab) which will be prepended to
        continuation lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        self._maxlinelen = MAXLINELEN if maxlinelen is None else maxlinelen
        # The field name is followed by a colon and a space, hence the + 2.
        self._headerlen = 0 if header_name is None else len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        pieces = []
        prev_cs = None
        for text, cs in self._chunks:
            # Spaces must be preserved at the boundaries between encoded and
            # non-encoded words, so a space is inserted whenever we move from
            # a charset to None/us-ascii or the other way round.  This only
            # applies from the second chunk onwards.
            current_cs = cs
            if current_cs == _charset.UNKNOWN8BIT:
                raw = text.encode('ascii', 'surrogateescape')
                text = raw.decode('ascii', 'replace')
            if pieces:
                if prev_cs not in (None, 'us-ascii'):
                    if current_cs in (None, 'us-ascii'):
                        pieces.append(SPACE)
                        current_cs = None
                elif current_cs not in (None, 'us-ascii'):
                    pieces.append(SPACE)
            prev_cs = current_cs
            pieces.append(text)
        return EMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string; either is fine.  Turn ourselves
        # into the unencoded string value and let other do the comparison
        # with the arguments swapped.
        own_value = str(self)
        return other == own_value

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).
        None (the default) selects the charset given to the constructor.

        s may be a byte string or a Unicode string.  A byte string (i.e.
        isinstance(s, str) is false) is decoded with the charset's input
        codec, falling back to us-ascii when the charset has none; `errors'
        is passed as the errors argument of that decode call.  The resulting
        string is then test-encoded with the charset's output codec (again
        falling back to us-ascii) unless that codec is the unknown-8bit
        marker, so a string that cannot be encoded raises a UnicodeError
        here rather than later.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            s = s.decode(charset.input_codec or 'us-ascii', errors)
        # Encode once and discard the result: this only exists to raise
        # early if the text does not fit the output character set.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        Optional maxlinelen specifies the maximum length of each generated
        line, exclusive of the linesep string; when it is None the value
        chosen at construction time is used, and 0 means "do not wrap".
        Individual lines may be longer than maxlinelen if a folding point
        cannot be found.  The first line will be shorter by the length of
        the header name plus ": " if a header name was specified at Header
        construction time.

        Optional splitchars is a string containing characters which should
        be given extra weight by the splitting algorithm during normal
        header wrapping.  This is in very rough support of RFC 2822's
        `higher level syntactic breaks': split points preceded by a
        splitchar are preferred during line splitting, with the characters
        preferred in the order in which they appear in the string.  Space
        and tab may be included in the string to indicate whether preference
        should be given to one over the other as a split point when other
        split chars do not appear in the line being split.  Splitchars does
        not affect RFC 2047 encoded lines.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \r\n to produce
        RFC-compliant line separators when needed.

        A HeaderParseError is raised when the folded value looks as if it
        contains an embedded header.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  Substituting a huge number
        # achieves that without special-casing in _ValueFormatter.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            first = lines[0] if lines else ''
            formatter.feed('', first, charset)
            for line in lines[1:]:
                formatter.newline()
                stripped = line.lstrip()
                if charset.header_encoding is not None:
                    formatter.feed(self._continuation_ws, ' ' + stripped,
                                   charset)
                else:
                    # Keep the line's own leading whitespace as its
                    # folding whitespace.
                    fws = line[:len(line) - len(stripped)]
                    formatter.feed(fws, stripped, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Collapse every run of consecutive chunks that share a charset into
        # a single space-joined string.
        merged = []
        run_charset = None
        run_strings = []
        for string, charset in self._chunks:
            if charset == run_charset:
                run_strings.append(string)
                continue
            if run_charset is not None:
                merged.append((SPACE.join(run_strings), run_charset))
            run_strings = [string]
            run_charset = charset
        if run_strings:
            merged.append((SPACE.join(run_strings), run_charset))
        self._chunks = merged
370
371
372
class _ValueFormatter:
    """Accumulate header text and fold it into lines of limited length.

    Text is supplied through feed(); completed lines are collected in
    self._lines and the line under construction lives in an _Accumulator
    of (folding-whitespace, text) pairs.  _str() returns the final value.
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen is the space reserved on the first line (it seeds the
        accumulator's initial size), maxlen the maximum line length,
        continuation_ws the whitespace that starts continuation lines of
        encoded text, and splitchars the preferred split characters, in
        order of preference.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        self._continuation_ws_len = len(continuation_ws)
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the pending line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        return self._str(NL)

    def newline(self):
        """Finish the line under construction and start a fresh one."""
        # Drop a trailing transition marker (see add_transition()); anything
        # else that was popped goes straight back.
        end_of_line = self._current_line.pop()
        if end_of_line != (' ', ''):
            self._current_line.push(*end_of_line)
        if len(self._current_line) > 0:
            # A whitespace-only line is glued onto the previous line rather
            # than emitted on its own -- but only when a previous line
            # exists.  Without the self._lines check, a value consisting
            # solely of whitespace raised IndexError here.
            if self._current_line.is_onlyws() and self._lines:
                self._lines[-1] += str(self._current_line)
            else:
                self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Push the (' ', '') marker that separates two chunks."""
        self._current_line.push(' ', '')

    def feed(self, fws, string, charset):
        """Add string, preceded by folding whitespace fws, to the value.

        charset decides how the text is handled: when its header_encoding
        is None the text is split as plain ASCII, otherwise it is encoded
        through charset.header_encode_lines().
        """
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            self._ascii_split(fws, string, self._splitchars)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._append_chunk(fws, first_line)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        self.newline()
        self._current_line.push(self._continuation_ws, last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the room left on the current line, then on each new line."""
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, fws, string, splitchars):
        """Split fws+string at runs of whitespace and append each piece."""
        # The RFC 2822 header folding algorithm is simple in principle but
        # complex in practice.  Lines may be folded any place where "folding
        # white space" appears by inserting a linesep character in front of the
        # FWS.  The complication is that not all spaces or tabs qualify as FWS,
        # and we are also supposed to prefer to break at "higher level
        # syntactic breaks".  We can't do either of these without intimate
        # knowledge of the structure of structured headers, which we don't have
        # here.  So the best we can do here is prefer to break at the specified
        # splitchars, and hope that we don't choose any spaces or tabs that
        # aren't legal FWS.  (This is at least better than the old algorithm,
        # where we would sometimes *introduce* FWS after a splitchar, or the
        # algorithm before that, where we would turn all white space runs into
        # single spaces or tabs.)
        parts = re.split("(["+FWS+"]+)", fws+string)
        # Arrange parts as alternating [fws, text, fws, text, ...]: supply an
        # empty leading fws when the text starts with a word, otherwise drop
        # the empty string that precedes the first whitespace run.
        if parts[0]:
            parts[:0] = ['']
        else:
            parts.pop(0)
        # zip over the same iterator twice walks the list two items at a time.
        for fws, part in zip(*[iter(parts)]*2):
            self._append_chunk(fws, part)

    def _append_chunk(self, fws, string):
        """Push (fws, string) and fold the current line if it got too long."""
        self._current_line.push(fws, string)
        if len(self._current_line) > self._maxlen:
            # Find the best split point, working backward from the end.
            # There might be none, on a long first line.
            for ch in self._splitchars:
                for i in range(self._current_line.part_count()-1, 0, -1):
                    if ch.isspace():
                        fws = self._current_line[i][0]
                        if fws and fws[0]==ch:
                            break
                    prevpart = self._current_line[i-1][1]
                    if prevpart and prevpart[-1]==ch:
                        break
                else:
                    # No split point for this character; try the next one.
                    continue
                # Part i starts the remainder; stop searching.
                break
            else:
                # No split point at all: the chunk just pushed has to stay
                # in one piece.
                fws, part = self._current_line.pop()
                if self._current_line._initial_size > 0:
                    # There will be a header, so leave it on a line by itself.
                    self.newline()
                    if not fws:
                        # We don't use continuation_ws here because the whitespace
                        # after a header should always be a space.
                        fws = ' '
                self._current_line.push(fws, part)
                return
            remainder = self._current_line.pop_from(i)
            self._lines.append(str(self._current_line))
            self._current_line.reset(remainder)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000499
500
class _Accumulator(list):
    """A list of (fws, text) pairs describing one line being built.

    len() reports the total character count of all pairs plus an initial
    size, not the number of pairs; use part_count() for the latter.
    """

    def __init__(self, initial_size=0):
        self._initial_size = initial_size
        super().__init__()

    def push(self, fws, string):
        """Add one (fws, string) pair at the end."""
        self.append((fws, string))

    def pop_from(self, i=0):
        """Remove and return, as a list, every pair from index i onwards."""
        tail = self[i:]
        del self[i:]
        return tail

    def pop(self):
        """Remove and return the last pair, or ('', '') when there is none."""
        if not self.part_count():
            return ('', '')
        return super().pop()

    def __len__(self):
        # Character count: the initial size plus every fws and every part.
        total = self._initial_size
        for fws, part in self:
            total += len(fws) + len(part)
        return total

    def __str__(self):
        return EMPTYSTRING.join(fws + part for fws, part in self)

    def reset(self, startval=None):
        """Replace the contents with startval (default: nothing).

        The initial size is cleared as well.
        """
        self[:] = [] if startval is None else startval
        self._initial_size = 0

    def is_onlyws(self):
        """Return True if there is no initial size and no non-space text."""
        if self._initial_size != 0:
            return False
        return not self or str(self).isspace()

    def part_count(self):
        """Return the number of (fws, text) pairs held."""
        return super().__len__()