| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 1 | # Copyright (C) 2002-2007 Python Software Foundation | 
|  | 2 | # Author: Ben Gertzfield, Barry Warsaw | 
|  | 3 | # Contact: email-sig@python.org | 
|  | 4 |  | 
|  | 5 | """Header encoding and decoding functionality.""" | 
|  | 6 |  | 
|  | 7 | __all__ = [ | 
|  | 8 | 'Header', | 
|  | 9 | 'decode_header', | 
|  | 10 | 'make_header', | 
|  | 11 | ] | 
|  | 12 |  | 
|  | 13 | import re | 
|  | 14 | import binascii | 
|  | 15 |  | 
|  | 16 | import email.quoprimime | 
|  | 17 | import email.base64mime | 
|  | 18 |  | 
|  | 19 | from email.errors import HeaderParseError | 
| R. David Murray | 9253214 | 2011-01-07 23:25:30 +0000 | [diff] [blame] | 20 | from email import charset as _charset | 
|  | 21 | Charset = _charset.Charset | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 22 |  | 
# Small string/bytes constants shared by the functions and classes below.
NL = '\n'
SPACE = ' '
BSPACE = b' '
# Replacement for a tab when measuring the width of continuation whitespace
# (see _ValueFormatter.__init__).
SPACE8 = ' ' * 8
EMPTYSTRING = ''
# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 78

USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The three named groups (charset, encoding, encoded) are what
# decode_header() gets back, interleaved with plain text, from ecre.split().
ecre = re.compile(r'''
=\?                   # literal =?
(?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
\?                    # literal ?
(?P<encoding>[qb])    # either a "q" or a "b", case insensitive
\?                    # literal ?
(?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
\?=                   # literal ?=
(?=[ \t]|$)           # whitespace or the end of the string
''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

# Field name regexp, including trailing colon, but not separating whitespace,
# according to RFC 2822.  Character range is from exclamation mark (octal
# 041) to tilde (octal 176).
# For use with .match()
fcre = re.compile(r'[\041-\176]+:$')

# Find a header embedded in a putative header value.  Used to check for
# header injection attack: it matches a newline that is immediately followed
# by a run of non-space, non-tab characters ending in a colon.
_embeded_header = re.compile(r'\n[^ \t]+:')



# Helpers
_max_append = email.quoprimime._max_append
|  | 58 |  | 
|  | 59 |  | 
|  | 60 |  | 
def decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of (string, charset) pairs, one for each decoded
    part of the header.  charset is None for parts that were not encoded;
    otherwise it is the lower-cased character set name found in the encoded
    word.

    If the header contains no encoded word at all, it is handed back
    untouched as the single pair (header, None).  Otherwise every string in
    the result is a bytes object, and neighbouring parts that share a charset
    are merged (unencoded neighbours are joined with a single space).

    An email.errors.HeaderParseError is raised when a base64 encoded word
    cannot be decoded.
    """
    # Fast path: nothing is encoded, so there is nothing to do.
    if ecre.search(header) is None:
        return [(header, None)]

    # Pass 1: cut every line into (text, encoding, charset) triplets.
    # ecre.split() returns plain text interleaved with the three captured
    # groups of each match: [text, charset, encoding, encoded, text, ...],
    # so the plain-text slots sit at every fourth index.
    triplets = []
    for line in header.splitlines():
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                triplets.append((plain, None, None))
            group = fields[start + 1:start + 4]
            if group:
                charset, encoding, encoded = group
                triplets.append((encoded, encoding.lower(), charset.lower()))

    # Pass 2: undo the quoted-printable / base64 transformation of each
    # encoded word, producing (decoded_word, charset) pairs.
    decoded = []
    for text, encoding, charset in triplets:
        if encoding is None:
            # Plain, unencoded text.
            payload = text
        elif encoding == 'q':
            payload = email.quoprimime.header_decode(text)
        elif encoding == 'b':
            # Postel's law: supply any padding the sender left off.
            remainder = len(text) % 4
            if remainder:
                text += '==='[:4 - remainder]
            try:
                payload = email.base64mime.decode(text)
            except binascii.Error:
                raise HeaderParseError('Base64 decoding error')
        else:
            raise AssertionError('Unexpected encoding: ' + encoding)
        decoded.append((payload, charset))

    # Pass 3: convert everything to bytes and merge consecutive runs that
    # carry the same charset.
    merged = []
    run_bytes = run_charset = None
    for payload, charset in decoded:
        if isinstance(payload, str):
            payload = bytes(payload, 'raw-unicode-escape')
        if run_bytes is None or charset != run_charset:
            # Start a new run, flushing the previous one if there was one.
            if run_bytes is not None:
                merged.append((run_bytes, run_charset))
            run_bytes, run_charset = payload, charset
        elif run_charset is None:
            # Unencoded words keep a separating space.
            run_bytes += BSPACE + payload
        else:
            run_bytes += payload
    merged.append((run_bytes, run_charset))
    return merged
|  | 133 |  | 
|  | 134 |  | 
|  | 135 |  | 
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from pairs in the format returned by decode_header().

    decoded_seq is a sequence of (decoded_string, charset) pairs, where
    charset is None, a character set name, or a Charset instance.  Each pair
    is appended, in order, to a freshly created Header.

    Optional maxlinelen, header_name, and continuation_ws are handed to the
    Header constructor unchanged.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # A charset of None is passed straight through to append(); anything
        # else that is not already a Charset is wrapped in one.
        if not (charset is None or isinstance(charset, Charset)):
            charset = Charset(charset)
        header.append(text, charset)
    return header
|  | 156 |  | 
|  | 157 |  | 
|  | 158 |  | 
class Header:
    """A header value that can contain text in several character sets.

    The value is kept as a list of (string, Charset) chunks in self._chunks.
    str() yields the unencoded text; encode() yields the folded, encoded
    form.
    """

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 78 as
        recommended by RFC 2822.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        self._maxlinelen = maxlinelen
        if header_name is None:
            self._headerlen = 0
        else:
            # Take the separating colon and space into account.
            self._headerlen = len(header_name) + 2

    def __str__(self):
        """Return the string value of the header.

        Note that this first normalizes (and therefore rewrites) the stored
        chunk list.
        """
        self._normalize()
        uchunks = []
        lastcs = None
        for string, charset in self._chunks:
            # We must preserve spaces between encoded and non-encoded word
            # boundaries, which means for us we need to add a space when we go
            # from a charset to None/us-ascii, or from None/us-ascii to a
            # charset.  Only do this for the second and subsequent chunks.
            nextcs = charset
            if nextcs == _charset.UNKNOWN8BIT:
                # Round-trip through ascii so that surrogate-escaped bytes
                # come out as replacement characters.
                original_bytes = string.encode('ascii', 'surrogateescape')
                string = original_bytes.decode('ascii', 'replace')
            if uchunks:
                if lastcs not in (None, 'us-ascii'):
                    if nextcs in (None, 'us-ascii'):
                        uchunks.append(SPACE)
                        nextcs = None
                elif nextcs not in (None, 'us-ascii'):
                    uchunks.append(SPACE)
            lastcs = nextcs
            uchunks.append(string)
        return EMPTYSTRING.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the unencoded string value of this header with other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a unicode (of the unencoded header value), swap the
        # args and do another comparison.
        return other == str(self)

    def __ne__(self, other):
        """Return the negation of __eq__."""
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, str) is false), then charset is the encoding of
        that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In either case, when producing an RFC 2822 compliant
        header using RFC 2047 rules, the string will be encoded using the
        output codec of the charset.  If the string cannot be encoded to the
        output codec, a UnicodeError will be raised.

        Optional `errors' is passed as the errors argument to the decode
        call if s is a byte string, and to the trial encode to the output
        codec.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        if not isinstance(s, str):
            input_charset = charset.input_codec or 'us-ascii'
            s = s.decode(input_charset, errors)
        # Ensure that the bytes we're storing can be decoded to the output
        # character set, otherwise an early error is thrown.  The result of
        # the trial encode is discarded; the check is skipped for the
        # unknown-8bit charset.
        output_charset = charset.output_codec or 'us-ascii'
        if output_charset != _charset.UNKNOWN8BIT:
            s.encode(output_charset, errors)
        self._chunks.append((s, charset))

    def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        Optional maxlinelen overrides the maximum line length given to the
        constructor for this call only.  A value of 0 means don't wrap.

        Optional linesep is a string to be used to separate the lines of
        the value.  The default value is the most useful for typical
        Python applications, but it can be set to \\r\\n to produce
        RFC-compliant line separators when needed.

        Returns the encoded value as a string.  Raises HeaderParseError if
        the encoded value appears to contain an embedded header (a line break
        followed by something that looks like a field name and colon).
        """
        self._normalize()
        if maxlinelen is None:
            maxlinelen = self._maxlinelen
        # A maxlinelen of 0 means don't wrap.  For all practical purposes,
        # choosing a huge number here accomplishes that and makes the
        # _ValueFormatter algorithm much simpler.
        if maxlinelen == 0:
            maxlinelen = 1000000
        formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                    self._continuation_ws, splitchars)
        for string, charset in self._chunks:
            lines = string.splitlines()
            # ''.splitlines() is an empty list, so an empty chunk has no
            # first line to index; feed an empty string in that case instead
            # of raising IndexError.
            formatter.feed(lines[0] if lines else '', charset)
            for line in lines[1:]:
                formatter.newline()
                if charset.header_encoding is not None:
                    # Encoded continuation lines get explicit continuation
                    # whitespace, with the line's own leading whitespace
                    # collapsed to a single space.
                    formatter.feed(self._continuation_ws, USASCII)
                    line = ' ' + line.lstrip()
                formatter.feed(line, charset)
            if len(lines) > 1:
                formatter.newline()
            formatter.add_transition()
        value = formatter._str(linesep)
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

    def _normalize(self):
        """Collapse runs of chunks with the same charset, in place."""
        # Step 1: Normalize the chunks so that all runs of identical charsets
        # get collapsed into a single unicode string.
        chunks = []
        last_charset = None
        last_chunk = []
        for string, charset in self._chunks:
            if charset == last_charset:
                last_chunk.append(string)
            else:
                if last_charset is not None:
                    chunks.append((SPACE.join(last_chunk), last_charset))
                last_chunk = [string]
                last_charset = charset
        if last_chunk:
            chunks.append((SPACE.join(last_chunk), last_charset))
        self._chunks = chunks
|  | 350 |  | 
|  | 351 |  | 
|  | 352 |  | 
class _ValueFormatter:
    """Collect encoded header text into a list of folded output lines.

    Text is fed in chunk by chunk with feed(); finished lines accumulate in
    self._lines while the line under construction lives in
    self._current_line (an _Accumulator).
    """

    def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
        """Set up an empty formatter.

        headerlen is used as the initial size of the first line, maxlen is
        the line length compared against when deciding whether text fits,
        continuation_ws is the whitespace used to start continuation lines,
        and splitchars are the characters tried, in order, as split points
        for unencoded text.
        """
        self._maxlen = maxlen
        self._continuation_ws = continuation_ws
        # Width of the continuation whitespace with each tab counted as
        # eight columns.
        self._continuation_ws_len = len(continuation_ws.replace('\t', SPACE8))
        self._splitchars = splitchars
        self._lines = []
        self._current_line = _Accumulator(headerlen)

    def _str(self, linesep):
        """Flush the current line and return all lines joined by linesep."""
        self.newline()
        return linesep.join(self._lines)

    def __str__(self):
        """Return the formatted value using '\\n' as the line separator."""
        return self._str(NL)

    def newline(self):
        """Finish the current line and start a fresh, empty one."""
        # Peek at the last item: a trailing None (a transition marker pushed
        # by add_transition()) is dropped, anything else is put back.
        end_of_line = self._current_line.pop()
        if end_of_line is not None:
            self._current_line.push(end_of_line)
        if len(self._current_line) > 0:
            self._lines.append(str(self._current_line))
        self._current_line.reset()

    def add_transition(self):
        """Mark a chunk boundary on the current line.

        The None marker counts as one character and is rendered as a single
        space by _Accumulator unless it ends up last on the line.
        """
        self._current_line.push(None)

    def feed(self, string, charset):
        """Add string, encoded according to charset, folding as required."""
        # If the string itself fits on the current line in its encoded format,
        # then add it now and be done with it.
        encoded_string = charset.header_encode(string)
        if len(encoded_string) + len(self._current_line) <= self._maxlen:
            self._current_line.push(encoded_string)
            return
        # If the charset has no header encoding (i.e. it is an ASCII encoding)
        # then we must split the header at the "highest level syntactic break"
        # possible. Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then commas, then
        # whitespace.  Eventually, this should be pluggable.
        if charset.header_encoding is None:
            # Pick the first split character (in splitchars order) that
            # occurs in the string; the for/else leaves ch as None if none do.
            for ch in self._splitchars:
                if ch in string:
                    break
            else:
                ch = None
            # If there's no available split character then regardless of
            # whether the string fits on the line, we have to put it on a line
            # by itself.
            if ch is None:
                if not self._current_line.is_onlyws():
                    self._lines.append(str(self._current_line))
                    self._current_line.reset(self._continuation_ws)
                self._current_line.push(encoded_string)
            else:
                self._ascii_split(string, ch)
            return
        # Otherwise, we're doing either a Base64 or a quoted-printable
        # encoding which means we don't need to split the line on syntactic
        # breaks.  We can basically just find enough characters to fit on the
        # current line, minus the RFC 2047 chrome.  What makes this trickier
        # though is that we have to split at octet boundaries, not character
        # boundaries but it's only safe to split at character boundaries so at
        # best we can only get close.
        encoded_lines = charset.header_encode_lines(string, self._maxlengths())
        # The first element extends the current line, but if it's None then
        # nothing more fit on the current line so start a new line.
        try:
            first_line = encoded_lines.pop(0)
        except IndexError:
            # There are no encoded lines, so we're done.
            return
        if first_line is not None:
            self._current_line.push(first_line)
        self._lines.append(str(self._current_line))
        self._current_line.reset(self._continuation_ws)
        try:
            last_line = encoded_lines.pop()
        except IndexError:
            # There was only one line.
            return
        # The last encoded line stays open on the current line so later
        # feeds can extend it.
        self._current_line.push(last_line)
        # Everything else are full lines in themselves.
        for line in encoded_lines:
            self._lines.append(self._continuation_ws + line)

    def _maxlengths(self):
        """Yield the space available on each successive output line.

        The first value is what is left on the current line; every later
        value is the maximum length minus the continuation whitespace width.
        The generator never terminates.
        """
        # The first line's length.
        yield self._maxlen - len(self._current_line)
        while True:
            yield self._maxlen - self._continuation_ws_len

    def _ascii_split(self, string, ch):
        """Fold unencoded string onto output lines, breaking at ch.

        Pieces that are known to fit are gathered in a local accumulator
        (holding) and moved onto the current line when a fold is made or the
        string is exhausted.
        """
        holding = _Accumulator()
        # Split the line on the split character, preserving it.  If the split
        # character is whitespace RFC 2822 $2.2.3 requires us to fold on the
        # whitespace, so that the line leads with the original whitespace we
        # split on.  However, if a higher syntactic break is used instead
        # (e.g. comma or semicolon), the folding should happen after the split
        # character.  But then in that case, we need to add our own
        # continuation whitespace -- although won't that break unfolding?
        for part, splitpart, nextpart in _spliterator(ch, string):
            if not splitpart:
                # No splitpart means this is the last chunk.  Put this part
                # either on the current line or the next line depending on
                # whether it fits.
                holding.push(part)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    # It fits, but we're done.
                    self._current_line.push(str(holding))
                else:
                    # It doesn't fit, but we're done.  Before pushing a new
                    # line, watch out for the current line containing only
                    # whitespace.
                    holding.pop()
                    if self._current_line.is_onlyws() and holding.is_onlyws():
                        # Don't start a new line.
                        holding.push(part)
                        part = None
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    if part is None:
                        self._current_line.reset()
                    else:
                        holding.reset(part)
                        self._current_line.reset(str(holding))
                return
            elif not nextpart:
                # There must be some trailing split characters because we
                # found a split character but no next part.  In this case we
                # must treat the thing to fit as the part + splitpart because
                # if splitpart is whitespace it's not allowed to be the only
                # thing on the line, and if it's not whitespace we must split
                # after the syntactic break.  In either case, we're done.
                holding_prelen = len(holding)
                holding.push(part + splitpart)
                if len(holding) + len(self._current_line) <= self._maxlen:
                    self._current_line.push(str(holding))
                elif holding_prelen == 0:
                    # This is the only chunk left so it has to go on the
                    # current line.
                    self._current_line.push(str(holding))
                else:
                    save_part = holding.pop()
                    self._current_line.push(str(holding))
                    self._lines.append(str(self._current_line))
                    holding.reset(save_part)
                    self._current_line.reset(str(holding))
                return
            elif not part:
                # We're leading with a split character.  See if the splitpart
                # and nextpart fits on the current line.
                holding.push(splitpart + nextpart)
                holding_len = len(holding)
                # We know we're not leaving the nextpart on the stack.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(splitpart)
                else:
                    # It doesn't fit.  Since there's no current part really
                    # the best we can do is start a new line and push the
                    # split part onto it.
                    self._current_line.push(str(holding))
                    holding.reset()
                    if len(self._current_line) > 0 and self._lines:
                        self._lines.append(str(self._current_line))
                        self._current_line.reset()
                    holding.push(splitpart)
            else:
                # All three parts are present.  First let's see if all three
                # parts will fit on the current line.  If so, we don't need to
                # split it.
                holding.push(part + splitpart + nextpart)
                holding_len = len(holding)
                # Pop the part because we'll push nextpart on the next
                # iteration through the loop.
                holding.pop()
                if holding_len + len(self._current_line) <= self._maxlen:
                    holding.push(part + splitpart)
                else:
                    # The entire thing doesn't fit.  See if we need to split
                    # before or after the split characters.
                    if splitpart.isspace():
                        # Split before whitespace.  Remember that the
                        # whitespace becomes the continuation whitespace of
                        # the next line so it goes to current_line not holding.
                        holding.push(part)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        self._current_line.reset(splitpart)
                    else:
                        # Split after non-whitespace.  The continuation
                        # whitespace comes from the instance variable.
                        holding.push(part + splitpart)
                        self._current_line.push(str(holding))
                        holding.reset()
                        self._lines.append(str(self._current_line))
                        if nextpart[0].isspace():
                            self._current_line.reset()
                        else:
                            self._current_line.reset(self._continuation_ws)
        # Get the last of the holding part
        self._current_line.push(str(holding))
|  | 556 |  | 
|  | 557 |  | 
|  | 558 |  | 
|  | 559 | def _spliterator(character, string): | 
|  | 560 | parts = list(reversed(re.split('(%s)' % character, string))) | 
|  | 561 | while parts: | 
|  | 562 | part = parts.pop() | 
|  | 563 | splitparts = (parts.pop() if parts else None) | 
|  | 564 | nextpart = (parts.pop() if parts else None) | 
|  | 565 | yield (part, splitparts, nextpart) | 
|  | 566 | if nextpart is not None: | 
|  | 567 | parts.append(nextpart) | 
|  | 568 |  | 
|  | 569 |  | 
|  | 570 | class _Accumulator: | 
| Guido van Rossum | 9604e66 | 2007-08-30 03:46:43 +0000 | [diff] [blame] | 571 | def __init__(self, initial_size=0): | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 572 | self._initial_size = initial_size | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 573 | self._current = [] | 
|  | 574 |  | 
|  | 575 | def push(self, string): | 
|  | 576 | self._current.append(string) | 
|  | 577 |  | 
|  | 578 | def pop(self): | 
| Barry Warsaw | 00b3422 | 2007-08-31 02:35:00 +0000 | [diff] [blame] | 579 | if not self._current: | 
|  | 580 | return None | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 581 | return self._current.pop() | 
|  | 582 |  | 
|  | 583 | def __len__(self): | 
| Barry Warsaw | 00b3422 | 2007-08-31 02:35:00 +0000 | [diff] [blame] | 584 | return sum(((1 if string is None else len(string)) | 
|  | 585 | for string in self._current), | 
| Guido van Rossum | 9604e66 | 2007-08-30 03:46:43 +0000 | [diff] [blame] | 586 | self._initial_size) | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 587 |  | 
|  | 588 | def __str__(self): | 
| Barry Warsaw | 00b3422 | 2007-08-31 02:35:00 +0000 | [diff] [blame] | 589 | if self._current and self._current[-1] is None: | 
|  | 590 | self._current.pop() | 
|  | 591 | return EMPTYSTRING.join((' ' if string is None else string) | 
|  | 592 | for string in self._current) | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 593 |  | 
|  | 594 | def reset(self, string=None): | 
|  | 595 | self._current = [] | 
| Guido van Rossum | 8b3febe | 2007-08-30 01:15:14 +0000 | [diff] [blame] | 596 | self._initial_size = 0 | 
|  | 597 | if string is not None: | 
|  | 598 | self.push(string) | 
| Guido van Rossum | 9604e66 | 2007-08-30 03:46:43 +0000 | [diff] [blame] | 599 |  | 
|  | 600 | def is_onlyws(self): | 
|  | 601 | return len(self) == 0 or str(self).isspace() |