| Barry Warsaw | 409a4c0 | 2002-04-10 21:01:31 +0000 | [diff] [blame] | 1 | # Copyright (C) 2002 Python Software Foundation | 
| Barry Warsaw | 174aa49 | 2002-09-30 15:51:31 +0000 | [diff] [blame] | 2 | # Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw) | 
| Barry Warsaw | 409a4c0 | 2002-04-10 21:01:31 +0000 | [diff] [blame] | 3 |  | 
 | 4 | """Header encoding and decoding functionality.""" | 
 | 5 |  | 
 | 6 | import re | 
| Barry Warsaw | 174aa49 | 2002-09-30 15:51:31 +0000 | [diff] [blame] | 7 | from types import StringType, UnicodeType | 
 | 8 |  | 
| Barry Warsaw | 409a4c0 | 2002-04-10 21:01:31 +0000 | [diff] [blame] | 9 | import email.quopriMIME | 
 | 10 | import email.base64MIME | 
 | 11 | from email.Charset import Charset | 
 | 12 |  | 
# _floordiv(a, b) is integer floor division; it lives in a version-specific
# helper module because the `//' operator is a SyntaxError on Python 2.1.
# It is used below to halve a splittable string in Header._split().
try:
    from email._compat22 import _floordiv
except SyntaxError:
    # Python 2.1 spells integer division differently
    from email._compat21 import _floordiv

# Provide True/False on interpreters that predate the built-in names.
try:
    True, False
except NameError:
    True = 1
    False = 0

# String constants.  NL is used to join folded header lines and EMPTYSTRING to
# join line fragments; SPACE8 is substituted for a tab when measuring a line's
# width in columns.  CRLFSPACE and CRLF are not referenced in this part of the
# file.
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NL = '\n'
SPACE8 = ' ' * 8
EMPTYSTRING = ''

# Default maximum line length used by Header when maxlinelen is not given.
MAXLINELEN = 76

# NOTE(review): ENCODE and DECODE are not referenced in this part of the file;
# presumably kept for external callers -- confirm before removing.
ENCODE = 1
DECODE = 2

# Shared Charset instances: USASCII is the default charset of a Header, and
# both are tried as fallbacks when Header.append() encodes a Unicode string.
USASCII = Charset('us-ascii')
UTF8 = Charset('utf-8')

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# Because of the three named groups, ecre.split() yields a flat list of the
# form [unencoded, charset, encoding, encoded, unencoded, ...].
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)



# Helpers
# Called as _max_append(list, string, maxlen[, separator]) by
# Header._encode_chunks() to accumulate output lines.
_max_append = email.quopriMIME._max_append
 | 54 |  | 
 | 55 |  | 
 | 56 |  | 
 | 57 | def decode_header(header): | 
 | 58 |     """Decode a message header value without converting charset. | 
 | 59 |  | 
 | 60 |     Returns a list of (decoded_string, charset) pairs containing each of the | 
 | 61 |     decoded parts of the header.  Charset is None for non-encoded parts of the | 
 | 62 |     header, otherwise a lower-case string containing the name of the character | 
 | 63 |     set specified in the encoded string. | 
 | 64 |     """ | 
 | 65 |     # If no encoding, just return the header | 
 | 66 |     header = str(header) | 
 | 67 |     if not ecre.search(header): | 
 | 68 |         return [(header, None)] | 
| Barry Warsaw | 409a4c0 | 2002-04-10 21:01:31 +0000 | [diff] [blame] | 69 |     decoded = [] | 
 | 70 |     dec = '' | 
 | 71 |     for line in header.splitlines(): | 
 | 72 |         # This line might not have an encoding in it | 
 | 73 |         if not ecre.search(line): | 
 | 74 |             decoded.append((line, None)) | 
 | 75 |             continue | 
| Barry Warsaw | 409a4c0 | 2002-04-10 21:01:31 +0000 | [diff] [blame] | 76 |         parts = ecre.split(line) | 
 | 77 |         while parts: | 
 | 78 |             unenc = parts.pop(0).strip() | 
 | 79 |             if unenc: | 
 | 80 |                 # Should we continue a long line? | 
 | 81 |                 if decoded and decoded[-1][1] is None: | 
 | 82 |                     decoded[-1] = (decoded[-1][0] + dec, None) | 
 | 83 |                 else: | 
 | 84 |                     decoded.append((unenc, None)) | 
 | 85 |             if parts: | 
 | 86 |                 charset, encoding = [s.lower() for s in parts[0:2]] | 
 | 87 |                 encoded = parts[2] | 
 | 88 |                 dec = '' | 
 | 89 |                 if encoding == 'q': | 
 | 90 |                     dec = email.quopriMIME.header_decode(encoded) | 
 | 91 |                 elif encoding == 'b': | 
 | 92 |                     dec = email.base64MIME.decode(encoded) | 
 | 93 |                 else: | 
 | 94 |                     dec = encoded | 
 | 95 |  | 
 | 96 |                 if decoded and decoded[-1][1] == charset: | 
 | 97 |                     decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1]) | 
 | 98 |                 else: | 
 | 99 |                     decoded.append((dec, charset)) | 
 | 100 |             del parts[0:3] | 
 | 101 |     return decoded | 
 | 102 |  | 
 | 103 |  | 
 | 104 |  | 
def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format produced by
    decode_header(): each pair is (decoded_string, charset), where charset is
    None, the string name of a character set, or a Charset instance.

    Each pair is appended, in order, to a new Header instance which is then
    returned.  Optional maxlinelen, header_name, and continuation_ws are
    passed straight through to the Header constructor.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cset in decoded_seq:
        if cset is None or isinstance(cset, Charset):
            # None is handed on untouched; Header.append() substitutes the
            # header's default charset for it.
            header.append(text, cset)
        else:
            # A charset name: wrap it in a Charset instance first.
            header.append(text, Charset(cset))
    return header
 | 125 |  | 
 | 126 |  | 
 | 127 |  | 
class Header:
    """A header value built from (string, Charset) chunks.

    Chunks are accumulated with .append() and turned into a folded, encoded
    header string by .encode() (also available through str()).
    """

    def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
                 continuation_ws=' '):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If None, the initial header
        value is not set.  You can later append to the header with .append()
        method calls.  s may be a byte string or a Unicode string, but see the
        .append() documentation for semantics.

        Optional charset serves two purposes: it has the same meaning as the
        charset argument to the .append() method.  It also sets the default
        character set for all subsequent .append() calls that omit the charset
        argument.  If charset is not provided in the constructor, the us-ascii
        charset is used both as s's initial charset and as the default for
        subsequent .append() calls.

        The maximum line length can be specified explicitly via maxlinelen.
        For splitting the first line to a shorter value (to account for the
        field header which isn't included in s, e.g. `Subject') pass in the
        name of the field in header_name.  The default maxlinelen is 76.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        either a space or a hard tab) which will be prepended to continuation
        lines.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Width of the continuation prefix in columns (a tab counts as 8).
        cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset)
        if maxlinelen is None:
            maxlinelen = MAXLINELEN
        if header_name is None:
            # We don't know anything about the field header so the first line
            # is the same length as subsequent lines.
            self._firstlinelen = maxlinelen
        else:
            # The first line should be shorter to take into account the field
            # header.  Also subtract off 2 extra for the colon and space.
            self._firstlinelen = maxlinelen - len(header_name) - 2
        # Second and subsequent lines should subtract off the length in
        # columns of the continuation whitespace prefix.
        self._maxlinelen = maxlinelen - cws_expanded_len

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        # charset item is a Charset instance so we need to stringify it.
        uchunks = [unicode(s, str(charset)) for s, charset in self._chunks]
        return u''.join(uchunks)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        """Compare the encoded form of this header against other."""
        # other may be a Header or a string.  Both are fine so coerce
        # ourselves to a string, swap the args and do another comparison.
        return other == self.encode()

    def __ne__(self, other):
        """Inverse of __eq__."""
        return not self == other

    def append(self, s, charset=None):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which will be converted to a Charset instance).  A
        value of None (the default) means that the charset given in the
        constructor is used.

        s may be a byte string or a Unicode string.  If it is a byte string
        (i.e. isinstance(s, StringType) is true), then charset is the encoding
        of that byte string, and a UnicodeError will be raised if the string
        cannot be decoded with that charset.  If s is a Unicode string, then
        charset is a hint specifying the character set of the characters in
        the string.  In this case, when producing an RFC 2822 compliant header
        using RFC 2047 rules, the Unicode string will be encoded using the
        following charsets in order: us-ascii, the charset hint, utf-8.  The
        first character set not to provoke a UnicodeError is used.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # If the charset is our faux 8bit charset, leave the string unchanged
        if charset != '8bit':
            # We need to test that the string can be converted to unicode and
            # back to a byte string, given the input and output codecs of the
            # charset.
            if isinstance(s, StringType):
                # Possibly raise UnicodeError if the byte string can't be
                # converted to a unicode with the input codec of the charset.
                incodec = charset.input_codec or 'us-ascii'
                ustr = unicode(s, incodec)
                # Now make sure that the unicode could be converted back to a
                # byte string with the output codec, which may be different
                # than the input codec.  Still, use the original byte string.
                outcodec = charset.output_codec or 'us-ascii'
                ustr.encode(outcodec)
            elif isinstance(s, UnicodeType):
                # Now we have to be sure the unicode string can be converted
                # to a byte string with a reasonable output codec.  We want to
                # use the byte string in the chunk.  The candidate tuple is
                # built before the loop rebinds `charset', so the middle entry
                # is the caller's hint; on success `charset' is left bound to
                # the charset that actually worked.
                for charset in USASCII, charset, UTF8:
                    try:
                        outcodec = charset.output_codec or 'us-ascii'
                        s = s.encode(outcodec)
                        break
                    except UnicodeError:
                        pass
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, firstline=False):
        # Split up a header safely for use with encode_chunks.
        #
        # Returns a list of (string, charset) pairs.  firstline is only
        # consulted for us-ascii chunks, where it is forwarded to
        # _ascii_split(); the other branches measure against _maxlinelen.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)
        elen = charset.encoded_header_len(encoded)

        if elen <= self._maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  We
        # can't really split it because it might be multibyte data which we
        # could break if we split it between pairs.  The least harm seems to
        # be to not split the header at all, but that means they could go out
        # longer than maxlinelen.
        elif charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        #
        elif charset == 'us-ascii':
            return self._ascii_split(s, charset, firstline)
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string
            splitpnt = self._maxlinelen
            first = charset.from_splittable(splittable[:splitpnt], False)
            last = charset.from_splittable(splittable[splitpnt:], False)
        else:
            # A string of fewer than two units cannot be halved: the "last"
            # half would be the whole string again and the recursion below
            # would never terminate.  Emit it as is, even though it is longer
            # than _maxlinelen.
            if len(splittable) < 2:
                return [(encoded, charset)]
            # Divide and conquer.
            halfway = _floordiv(len(splittable), 2)
            first = charset.from_splittable(splittable[:halfway], False)
            last = charset.from_splittable(splittable[halfway:], False)
        # Do the split
        return self._split(first, charset, firstline) + \
               self._split(last, charset)

    def _ascii_split(self, s, charset, firstline):
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try to break on semi-colons, then whitespace.
        #
        # Returns a list of (line, charset) pairs.  If firstline is true, the
        # first physical line is measured against _firstlinelen; every other
        # line is measured against _maxlinelen.
        rtn = []
        lines = s.splitlines()
        while lines:
            line = lines.pop(0)
            if firstline:
                maxlinelen = self._firstlinelen
                firstline = False
            else:
                maxlinelen = self._maxlinelen
            # Short lines can remain unchanged
            if len(line.replace('\t', SPACE8)) <= maxlinelen:
                rtn.append(line)
            else:
                oldlen = len(line)
                # Try to break the line on semicolons, but if that doesn't
                # work, try to split on folding whitespace.
                while len(line) > maxlinelen:
                    i = line.rfind(';', 0, maxlinelen)
                    if i < 0:
                        break
                    rtn.append(line[:i] + ';')
                    line = line[i+1:]
                # Is the remaining stuff still longer than maxlinelen?
                if len(line) <= maxlinelen:
                    # Splitting on semis worked
                    rtn.append(line)
                    continue
                # Splitting on semis didn't finish the job.  If it did any
                # work at all, stick the remaining junk on the front of the
                # `lines' sequence and let the next pass do its thing.
                if len(line) != oldlen:
                    lines.insert(0, line)
                    continue
                # Otherwise, splitting on semis didn't help at all.
                parts = re.split(r'(\s+)', line)
                if len(parts) == 1 or (len(parts) == 3 and
                                       parts[0].endswith(':')):
                    # This line can't be split on whitespace.  There's now
                    # little we can do to get this into maxlinelen.  BAW:
                    # We're still potentially breaking the RFC by possibly
                    # allowing lines longer than the absolute maximum of 998
                    # characters.  For now, let it slide.
                    #
                    # len(parts) will be 1 if this line has no `Field: '
                    # prefix, otherwise it will be 3.
                    rtn.append(line)
                    continue
                # There is whitespace we can split on.  Because the pattern
                # has a capturing group, `parts' alternates word, whitespace,
                # word, ...; after popping the first word the remainder is a
                # sequence of (whitespace, word) pairs.
                first = parts.pop(0)
                sublines = [first]
                acc = len(first)
                while parts:
                    len0 = len(parts[0])
                    len1 = len(parts[1])
                    if acc + len0 + len1 <= maxlinelen:
                        sublines.append(parts.pop(0))
                        sublines.append(parts.pop(0))
                        acc += len0 + len1
                    else:
                        # Split it here, but don't forget to ignore the
                        # next whitespace-only part.  Flush whatever has
                        # been accumulated unless it is empty (the line
                        # began with whitespace and even its first word
                        # didn't fit).  Testing the joined text rather than
                        # `first' matters: a line that starts with
                        # whitespace has first == '' even after words have
                        # been accumulated, and those must not be dropped.
                        joined = EMPTYSTRING.join(sublines)
                        if joined:
                            rtn.append(joined)
                        del parts[0]
                        first = parts.pop(0)
                        sublines = [first]
                        acc = len(first)
                rtn.append(EMPTYSTRING.join(sublines))
        return [(chunk, charset) for chunk in rtn]

    def _encode_chunks(self, newchunks):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.Utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.Utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        #
        chunks = []
        for header, charset in newchunks:
            if charset is None or charset.header_encoding is None:
                # There's no encoding for this chunk's charsets
                _max_append(chunks, header, self._maxlinelen)
            else:
                _max_append(chunks, charset.header_encode(header),
                            self._maxlinelen, ' ')
        # Continuation lines start with the configured folding whitespace.
        joiner = NL + self._continuation_ws
        return joiner.join(chunks)

    def encode(self):
        """Encode a message header into an RFC-compliant format.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        This method splits every appended chunk to fit the configured line
        lengths, encodes each piece according to its charset, and joins the
        resulting lines with a newline followed by the continuation
        whitespace.  The encoded header is returned as a string.
        """
        newchunks = []
        for s, charset in self._chunks:
            # NOTE(review): firstline=True is passed for every chunk, not
            # just the first one appended -- confirm this is intended.
            newchunks += self._split(s, charset, True)
        return self._encode_chunks(newchunks)