blob: 6820ea16baf3d1379e651c5d3b5391fd4e368d8d [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2002-2007 Python Software Foundation
2# Author: Ben Gertzfield, Barry Warsaw
3# Contact: email-sig@python.org
4
5"""Header encoding and decoding functionality."""
6
7__all__ = [
8 'Header',
9 'decode_header',
10 'make_header',
11 ]
12
13import re
14import binascii
15
16import email.quoprimime
17import email.base64mime
18
19from email.errors import HeaderParseError
R. David Murray92532142011-01-07 23:25:30 +000020from email import charset as _charset
21Charset = _charset.Charset
Guido van Rossum8b3febe2007-08-30 01:15:14 +000022
# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
# Bytes separator used by decode_header() when joining unencoded runs.
BSPACE = b' '
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when none is given.
MAXLINELEN = 78
# Characters treated as folding white space when splitting ASCII values.
FWS = ' \t'

# Charset instances used as the default (us-ascii) and as the fallback
# (utf-8) when text cannot be encoded as us-ascii; see Header.append().
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The three named groups (charset, encoding, encoded) are what
# decode_header() relies on when it calls ecre.split().
ecre = re.compile(r'''
 =\? # literal =?
 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
 \? # literal ?
 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
 \? # literal ?
 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
 \?= # literal ?=
 ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822. Character range is from exclamation mark (\041) to
# tilde (\176), i.e. the printable, non-space ASCII characters.
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value. Used to check for
# header injection attack: a newline followed by non-whitespace characters
# and a colon looks like the start of a new header field.
_embeded_header = re.compile(r'\n[^ \t]+:')



# Helpers
# Module-level alias for the quoprimime helper of the same name.
_max_append = email.quoprimime._max_append
58
59
60
def decode_header(header):
    """Split a header value into decoded (data, charset) pairs.

    The charset of each part is reported but never applied: encoded words
    are only un-base64'd / un-quoted-printable'd.  The result is a list of
    (string, charset) pairs.  charset is None for parts that were not RFC
    2047 encoded words, otherwise it is the lower-cased charset name taken
    from the encoded word.

    header may be a plain string (with or without encoded words) or a
    Header instance.  A string that contains no encoded words is returned
    unchanged as [(header, None)].

    Raises email.errors.HeaderParseError when an encoded word cannot be
    decoded (for example on a base64 decoding error).
    """
    # A Header already knows its chunks; just hand back their encoded form.
    if hasattr(header, '_chunks'):
        pairs = []
        for text, cs in header._chunks:
            cs_name = str(cs)
            pairs.append((_charset._encode(text, cs_name), cs_name))
        return pairs
    # Nothing to decode: return the value untouched, with no charset.
    if ecre.search(header) is None:
        return [(header, None)]
    # Pass 1: break every line into (text, encoding, charset) triples.
    # ecre has three groups, so split() yields
    #     [plain, charset, encoding, encoded, plain, charset, ...]
    # i.e. one plain-text slot every four items.  Plain text gets
    # (text, None, None).
    words = []
    for line in header.splitlines():
        fields = ecre.split(line)
        field_count = len(fields)
        for pos in range(0, field_count, 4):
            plain = fields[pos]
            if pos == 0:
                # Only the very first piece of a line loses leading blanks.
                plain = plain.lstrip()
            if plain:
                words.append((plain, None, None))
            if pos + 1 < field_count:
                cs_name, enc, payload = fields[pos + 1:pos + 4]
                words.append((payload, enc.lower(), cs_name.lower()))
    # Pass 2: whitespace that sits between two encoded words is not
    # significant, so drop it.  Delete from the back so the remaining
    # indexes stay valid.
    gaps = [
        idx - 1
        for idx, entry in enumerate(words)
        if idx > 1 and entry[1] and words[idx - 2][1]
        and words[idx - 1][0].isspace()
    ]
    for idx in reversed(gaps):
        del words[idx]

    # Pass 3: undo the base64 / quoted-printable transformation, giving a
    # list of (decoded_word, charset).
    decoded = []
    for payload, enc, cs_name in words:
        if enc is None:
            # Plain text, nothing to undo.
            decoded.append((payload, cs_name))
            continue
        if enc == 'q':
            decoded.append((email.quoprimime.header_decode(payload), cs_name))
        elif enc == 'b':
            # Postel's law: tolerate missing '=' padding.
            missing = len(payload) % 4
            if missing:
                payload += '==='[:4 - missing]
            try:
                raw = email.base64mime.decode(payload)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
            decoded.append((raw, cs_name))
        else:
            raise AssertionError('Unexpected encoding: ' + enc)
    # Pass 4: make everything bytes and merge neighbours that share a
    # charset.  Unencoded neighbours are joined with a single space.
    collapsed = []
    acc = acc_cs = None
    for word, cs in decoded:
        if isinstance(word, str):
            word = bytes(word, 'raw-unicode-escape')
        if acc is None:
            acc, acc_cs = word, cs
        elif cs != acc_cs:
            collapsed.append((acc, acc_cs))
            acc, acc_cs = word, cs
        elif acc_cs is None:
            acc += BSPACE + word
        else:
            acc += word
    collapsed.append((acc, acc_cs))
    return collapsed
153
154
155
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): the decoded string and either None, a charset name, or
    a Charset instance.

    The optional maxlinelen, header_name, and continuation_ws arguments are
    passed straight to the Header constructor.  Returns the new Header.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        # A charset of None (meaning us-ascii) is passed through untouched;
        # plain names are wrapped in a Charset first.
        if not (cs is None or isinstance(cs, Charset)):
            cs = Charset(cs)
        result.append(text, cs)
    return result
176
177
178
class Header:
    """A header value stored as a list of (string, Charset) chunks."""

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that may mix character sets.

        s is the optional initial value; when it is None the header starts
        out empty and is filled in later with .append().  s may be a byte
        string or a Unicode string; see .append() for the exact semantics.

        charset plays two roles: it is used for s exactly as in .append(),
        and it becomes the default for every later .append() call that
        omits its own charset.  When it is not given, us-ascii is used for
        both.

        maxlinelen sets the maximum line length explicitly; it defaults to
        78, as recommended by RFC 2822.  Pass the field name (e.g.
        `Subject') as header_name so that the first line is shortened to
        leave room for the field name, which is not part of s.

        continuation_ws must be RFC 2822 compliant folding whitespace
        (usually a space or a hard tab); it is prepended to continuation
        lines.

        errors is handed to the .append() call made for s.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        self._maxlinelen = MAXLINELEN if maxlinelen is None else maxlinelen
        # The "+ 2" accounts for the colon and space after the field name.
        self._headerlen = 0 if header_name is None else len(header_name) + 2

    def __str__(self):
        """Return the string value of the header."""
        self._normalize()
        plain = (None, 'us-ascii')
        pieces = []
        prev_cs = None
        prev_ends_ws = None
        for text, cs in self._chunks:
            # A space must separate encoded and unencoded text, so one is
            # inserted whenever we cross from a real charset to
            # None/us-ascii or back -- but only from the second chunk on,
            # and only if the None/us-ascii side does not already supply
            # one (at its start or end, depending on the direction).
            cur_cs = cs
            if cur_cs == _charset.UNKNOWN8BIT:
                raw = text.encode('ascii', 'surrogateescape')
                text = raw.decode('ascii', 'replace')
            if pieces:
                starts_ws = text and self._nonctext(text[0])
                if prev_cs not in plain:
                    if cur_cs in plain and not starts_ws:
                        pieces.append(SPACE)
                        cur_cs = None
                elif cur_cs not in plain and not prev_ends_ws:
                    pieces.append(SPACE)
            prev_ends_ws = text and self._nonctext(text[-1])
            prev_cs = cur_cs
            pieces.append(text)
        return EMPTYSTRING.join(pieces)

    # Only equality is supported.  BAW: does it make sense to have or
    # explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other can be a Header or a string.  Turn ourselves into the
        # unencoded string value and let other do the comparing with the
        # operands swapped.
        rendered = str(self)
        return other == rendered

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        charset may be a Charset instance or a character set name (which is
        converted to a Charset).  None, the default, selects the charset
        given to the constructor.

        s may be a byte string or a Unicode string.  For a byte string
        (isinstance(s, str) is false) charset names its encoding, and a
        UnicodeError is raised if it cannot be decoded that way.  For a
        Unicode string charset is a hint about which character set the
        characters belong to.  Either way, an RFC 2822 compliant header
        built with RFC 2047 rules uses the charset's output codec, and a
        UnicodeError is raised if the string cannot be encoded with it.

        errors is used as the errors argument of the decode call when s is
        a byte string.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            codec_in = charset.input_codec or 'us-ascii'
            if codec_in == _charset.UNKNOWN8BIT:
                s = s.decode('us-ascii', 'surrogateescape')
            else:
                s = s.decode(codec_in, errors)
        # Fail early if what we store cannot be written out in the output
        # character set.
        codec_out = charset.output_codec or 'us-ascii'
        if codec_out != _charset.UNKNOWN8BIT:
            try:
                s.encode(codec_out, errors)
            except UnicodeEncodeError:
                if codec_out != 'us-ascii':
                    raise
                # Text that does not fit us-ascii is stored as utf-8.
                charset = UTF8
        self._chunks.append((s, charset))

    def _nonctext(self, s):
        """True if string s is not a ctext character of RFC822.
        """
        if s.isspace():
            return True
        return s in ('(', ')', '\\')

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        r"""Encode a message header into an RFC-compliant format.

        Turning an arbitrary string into an email header is subtle: header
        text may only use a subset of 7-bit ASCII, and only some character
        sets are widely readable, so values have to be converted and
        encoded (Base64 or quoted-printable) with care.  Encoded header
        fields are also limited to 75 characters, so lines must be wrapped,
        even for double-byte character sets.

        maxlinelen is the maximum length of each generated line, not
        counting linesep.  A line may still exceed it when no folding point
        can be found.  If a header name was given at construction time the
        first line is shortened by the length of that name plus ": ".  The
        default is the value fixed at construction time.

        splitchars lists characters that the splitting algorithm should
        favour during normal header wrapping, in rough support of RFC
        2822's `higher level syntactic breaks': split points preceded by a
        splitchar are preferred, with earlier characters in the string
        preferred over later ones.  Space and tab may be included to say
        which of the two is preferred when no other split char appears in
        the line.  Splitchars does not affect RFC 2047 encoded lines.

        linesep is the string placed between the lines of the value.  The
        default suits typical Python applications; use \r\n to get
        RFC-compliant line separators.

        Raises HeaderParseError if the encoded value appears to contain an
        embedded header.
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # 0 means "never wrap".  A huge limit has the same practical effect
        # and keeps the _ValueFormatter algorithm simple.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        plain = (None, 'us-ascii')
        prev_cs = None
        # starts_ws stays None only until the first chunk has been seen.
        starts_ws = prev_ends_ws = None
        for text, cs in self._chunks:
            if starts_ws is not None:
                starts_ws = text and self._nonctext(text[0])
                if prev_cs not in plain:
                    if not starts_ws or cs not in plain:
                        formatter.add_transition()
                elif cs not in plain and not prev_ends_ws:
                    formatter.add_transition()
            prev_ends_ws = text and self._nonctext(text[-1])
            prev_cs = cs
            starts_ws = False
            lines = text.splitlines()
            head = lines[0] if lines else ''
            formatter.feed('', head, cs)
            for cont in lines[1:]:
                formatter.newline()
                if cs.header_encoding is not None:
                    formatter.feed(self._continuation_ws, ' ' + cont.lstrip(),
                                   cs)
                else:
                    # Keep the line's own leading whitespace as its fold.
                    stripped = cont.lstrip()
                    lead = cont[:len(cont) - len(stripped)]
                    formatter.feed(lead, stripped, cs)
            if len(lines) > 1:
                formatter.newline()
        if self._chunks:
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        # Collapse each run of chunks that share a charset into a single
        # chunk, joining the strings of the run with a space.
        merged = []
        run = []
        run_cs = None
        for text, cs in self._chunks:
            if cs == run_cs:
                run.append(text)
                continue
            if run_cs is not None:
                merged.append((SPACE.join(run), run_cs))
            run = [text]
            run_cs = cs
        if run:
            merged.append((SPACE.join(run), run_cs))
        self._chunks = merged
410
411
412
413class _ValueFormatter:
414 def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
415 self._maxlen = maxlen
416 self._continuation_ws = continuation_ws
R David Murray01581ee2011-04-18 10:04:34 -0400417 self._continuation_ws_len = len(continuation_ws)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000418 self._splitchars = splitchars
419 self._lines = []
420 self._current_line = _Accumulator(headerlen)
421
R. David Murray8451c4b2010-10-23 22:19:56 +0000422 def _str(self, linesep):
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000423 self.newline()
R. David Murray8451c4b2010-10-23 22:19:56 +0000424 return linesep.join(self._lines)
425
426 def __str__(self):
427 return self._str(NL)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000428
429 def newline(self):
Barry Warsaw00b34222007-08-31 02:35:00 +0000430 end_of_line = self._current_line.pop()
R David Murray01581ee2011-04-18 10:04:34 -0400431 if end_of_line != (' ', ''):
432 self._current_line.push(*end_of_line)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000433 if len(self._current_line) > 0:
R David Murray01581ee2011-04-18 10:04:34 -0400434 if self._current_line.is_onlyws():
435 self._lines[-1] += str(self._current_line)
436 else:
437 self._lines.append(str(self._current_line))
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000438 self._current_line.reset()
439
Barry Warsaw00b34222007-08-31 02:35:00 +0000440 def add_transition(self):
R David Murray01581ee2011-04-18 10:04:34 -0400441 self._current_line.push(' ', '')
Barry Warsaw00b34222007-08-31 02:35:00 +0000442
R David Murray01581ee2011-04-18 10:04:34 -0400443 def feed(self, fws, string, charset):
Guido van Rossum9604e662007-08-30 03:46:43 +0000444 # If the charset has no header encoding (i.e. it is an ASCII encoding)
445 # then we must split the header at the "highest level syntactic break"
446 # possible. Note that we don't have a lot of smarts about field
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000447 # syntax; we just try to break on semi-colons, then commas, then
Guido van Rossum9604e662007-08-30 03:46:43 +0000448 # whitespace. Eventually, this should be pluggable.
449 if charset.header_encoding is None:
R David Murray01581ee2011-04-18 10:04:34 -0400450 self._ascii_split(fws, string, self._splitchars)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000451 return
Guido van Rossum9604e662007-08-30 03:46:43 +0000452 # Otherwise, we're doing either a Base64 or a quoted-printable
453 # encoding which means we don't need to split the line on syntactic
454 # breaks. We can basically just find enough characters to fit on the
455 # current line, minus the RFC 2047 chrome. What makes this trickier
456 # though is that we have to split at octet boundaries, not character
457 # boundaries but it's only safe to split at character boundaries so at
458 # best we can only get close.
459 encoded_lines = charset.header_encode_lines(string, self._maxlengths())
460 # The first element extends the current line, but if it's None then
461 # nothing more fit on the current line so start a new line.
462 try:
463 first_line = encoded_lines.pop(0)
464 except IndexError:
465 # There are no encoded lines, so we're done.
466 return
467 if first_line is not None:
R David Murray01581ee2011-04-18 10:04:34 -0400468 self._append_chunk(fws, first_line)
Guido van Rossum9604e662007-08-30 03:46:43 +0000469 try:
470 last_line = encoded_lines.pop()
471 except IndexError:
472 # There was only one line.
473 return
R David Murray01581ee2011-04-18 10:04:34 -0400474 self.newline()
475 self._current_line.push(self._continuation_ws, last_line)
Guido van Rossum9604e662007-08-30 03:46:43 +0000476 # Everything else are full lines in themselves.
477 for line in encoded_lines:
478 self._lines.append(self._continuation_ws + line)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000479
Guido van Rossum9604e662007-08-30 03:46:43 +0000480 def _maxlengths(self):
481 # The first line's length.
482 yield self._maxlen - len(self._current_line)
483 while True:
484 yield self._maxlen - self._continuation_ws_len
485
R David Murray01581ee2011-04-18 10:04:34 -0400486 def _ascii_split(self, fws, string, splitchars):
487 # The RFC 2822 header folding algorithm is simple in principle but
488 # complex in practice. Lines may be folded any place where "folding
489 # white space" appears by inserting a linesep character in front of the
490 # FWS. The complication is that not all spaces or tabs qualify as FWS,
491 # and we are also supposed to prefer to break at "higher level
492 # syntactic breaks". We can't do either of these without intimate
493 # knowledge of the structure of structured headers, which we don't have
494 # here. So the best we can do here is prefer to break at the specified
495 # splitchars, and hope that we don't choose any spaces or tabs that
496 # aren't legal FWS. (This is at least better than the old algorithm,
497 # where we would sometimes *introduce* FWS after a splitchar, or the
498 # algorithm before that, where we would turn all white space runs into
499 # single spaces or tabs.)
500 parts = re.split("(["+FWS+"]+)", fws+string)
501 if parts[0]:
502 parts[:0] = ['']
503 else:
504 parts.pop(0)
505 for fws, part in zip(*[iter(parts)]*2):
506 self._append_chunk(fws, part)
507
508 def _append_chunk(self, fws, string):
509 self._current_line.push(fws, string)
510 if len(self._current_line) > self._maxlen:
511 # Find the best split point, working backward from the end.
512 # There might be none, on a long first line.
513 for ch in self._splitchars:
514 for i in range(self._current_line.part_count()-1, 0, -1):
515 if ch.isspace():
516 fws = self._current_line[i][0]
517 if fws and fws[0]==ch:
518 break
519 prevpart = self._current_line[i-1][1]
520 if prevpart and prevpart[-1]==ch:
521 break
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000522 else:
R David Murray01581ee2011-04-18 10:04:34 -0400523 continue
524 break
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000525 else:
R David Murray01581ee2011-04-18 10:04:34 -0400526 fws, part = self._current_line.pop()
527 if self._current_line._initial_size > 0:
528 # There will be a header, so leave it on a line by itself.
529 self.newline()
530 if not fws:
531 # We don't use continuation_ws here because the whitespace
532 # after a header should always be a space.
533 fws = ' '
534 self._current_line.push(fws, part)
535 return
536 remainder = self._current_line.pop_from(i)
537 self._lines.append(str(self._current_line))
538 self._current_line.reset(remainder)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000539
540
R David Murray01581ee2011-04-18 10:04:34 -0400541class _Accumulator(list):
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000542
Guido van Rossum9604e662007-08-30 03:46:43 +0000543 def __init__(self, initial_size=0):
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000544 self._initial_size = initial_size
R David Murray01581ee2011-04-18 10:04:34 -0400545 super().__init__()
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000546
R David Murray01581ee2011-04-18 10:04:34 -0400547 def push(self, fws, string):
548 self.append((fws, string))
549
550 def pop_from(self, i=0):
551 popped = self[i:]
552 self[i:] = []
553 return popped
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000554
555 def pop(self):
R David Murray01581ee2011-04-18 10:04:34 -0400556 if self.part_count()==0:
557 return ('', '')
558 return super().pop()
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000559
560 def __len__(self):
R David Murray01581ee2011-04-18 10:04:34 -0400561 return sum((len(fws)+len(part) for fws, part in self),
Guido van Rossum9604e662007-08-30 03:46:43 +0000562 self._initial_size)
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000563
564 def __str__(self):
R David Murray01581ee2011-04-18 10:04:34 -0400565 return EMPTYSTRING.join((EMPTYSTRING.join((fws, part))
566 for fws, part in self))
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000567
R David Murray01581ee2011-04-18 10:04:34 -0400568 def reset(self, startval=None):
569 if startval is None:
570 startval = []
571 self[:] = startval
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000572 self._initial_size = 0
Guido van Rossum9604e662007-08-30 03:46:43 +0000573
574 def is_onlyws(self):
R David Murray01581ee2011-04-18 10:04:34 -0400575 return self._initial_size==0 and (not self or str(self).isspace())
576
577 def part_count(self):
578 return super().__len__()