blob: 714839ede5a4e574425105e375696a90bf195a1b [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2002 Python Software Foundation
2# Author: che@debian.org (Ben Gertzfield)
3
4"""Header encoding and decoding functionality."""
5
6import re
7import email.quopriMIME
8import email.base64MIME
9from email.Charset import Charset
10
Barry Warsaw812031b2002-05-19 23:47:53 +000011try:
Barry Warsaw1c30aa22002-06-01 05:49:17 +000012 from email._compat22 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000013except SyntaxError:
14 # Python 2.1 spells integer division differently
Barry Warsaw1c30aa22002-06-01 05:49:17 +000015 from email._compat21 import _floordiv
Barry Warsaw812031b2002-05-19 23:47:53 +000016
# Separator strings.  Of these, only NLSPACE is used in this part of the
# file (Header.encode_chunks joins the encoded chunks with it).
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NLSPACE = '\n '

# Default maximum header line length; Header uses it when no explicit
# maxlinelen is given.
MAXLINELEN = 76

# NOTE(review): ENCODE and DECODE are not referenced in this part of the
# file -- presumably kept for external callers; confirm before removing.
ENCODE = 1
DECODE = 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The pattern has three groups (charset, encoding, encoded), so ecre.split()
# interleaves them with the surrounding unencoded text; decode_header()
# relies on that layout.
ecre = re.compile(r'''
 =\? # literal =?
 (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
 \? # literal ?
 (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
 \? # literal ?
 (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
 \?= # literal ?=
 ''', re.VERBOSE | re.IGNORECASE)



# Helpers
# Module-level alias for the quopriMIME helper; Header.encode_chunks calls it
# to add each encoded chunk to the list of output lines.
_max_append = email.quopriMIME._max_append
41
42
43
def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.

    The header is processed line by line.  Unencoded text that directly
    follows an unencoded chunk from an earlier line is joined onto that chunk
    with a single space; consecutive encoded words that share a charset are
    concatenated into one pair.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]

    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue

        # ecre has three groups, so split() returns
        # [unenc, charset, encoding, encoded, unenc, charset, ...]
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Append the unencoded text itself.  (This used to append
                    # the previously decoded encoded-word, which dropped the
                    # text and could duplicate stale data.)
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # Not reachable through ecre (it only matches q/b); kept
                    # as a defensive pass-through.
                    dec = encoded

                # Merge with the previous chunk when the charset is the same
                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            del parts[0:3]
    return decoded
91
92
93
class Header:
    def __init__(self, s, charset=None, maxlinelen=None, header_name=None):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  Specify its character set as a
        Charset object in the charset argument.  If none, a default Charset
        instance will be used.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can be specified explicitly via maxlinelen.
        You can also pass None for maxlinelen and the name of a header field
        (e.g. "Subject") to let the constructor guess the best line length to
        use.  The default maxlinelen is 76.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        self.append(s, charset)
        if maxlinelen is None:
            if header_name is None:
                self._maxlinelen = MAXLINELEN
            else:
                self.guess_maxlinelen(header_name)
        else:
            self._maxlinelen = maxlinelen

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def guess_maxlinelen(self, s=None):
        """Guess the maximum length to make each header line.

        Given a header name (e.g. "Subject"), set this header's maximum line
        length to MAXLINELEN minus the length of the name minus 2.  If s is
        not given, leave the current maximum line length unchanged.

        Returns the (possibly updated) maximum line length.  The result is
        not clamped, so a very long header name yields a value <= 0.
        """
        # BAW: is this semantic necessary?
        if s is not None:
            # NOTE(review): the 2 presumably accounts for the ": " that
            # follows the field name -- confirm.
            self._maxlinelen = MAXLINELEN - len(s) - 2
        return self._maxlinelen

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset):
        """Split s into a list of (string, charset) pairs that each fit.

        A piece "fits" when charset.encoded_header_len() of it is no larger
        than the maximum line length.  A piece that cannot be divided any
        further is returned as-is even if it is too long, so the recursion
        always terminates.
        """
        # Split up a header safely for use with encode_chunks.  BAW: this
        # appears to be a private convenience method.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)
        elen = charset.encoded_header_len(encoded)

        # Stop when the chunk fits, or when it has at most one splittable
        # unit: slicing such a chunk yields the chunk itself plus an empty
        # piece, which used to recurse forever.
        if elen <= self._maxlinelen or len(splittable) <= 1:
            return [(encoded, charset)]
        # BAW: should we use encoded?
        elif elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string.  Clamp the split
            # point so that both halves are non-empty; an unclamped value
            # (<= 0, or >= len(splittable)) reproduced the whole chunk in one
            # half and never terminated.
            splitpnt = min(max(self._maxlinelen, 1), len(splittable) - 1)
            first = charset.from_splittable(splittable[:splitpnt], 0)
            last = charset.from_splittable(splittable[splitpnt:], 0)
            return self._split(first, charset) + self._split(last, charset)
        else:
            # Divide and conquer.  len(splittable) >= 2 here, so both halves
            # are non-empty.
            halfway = _floordiv(len(splittable), 2)
            first = charset.from_splittable(splittable[:halfway], 0)
            last = charset.from_splittable(splittable[halfway:], 0)
            return self._split(first, charset) + self._split(last, charset)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Every stored chunk is first split into pieces that fit the maximum
        line length, and the result is then passed through encode_chunks().

        Note that this replaces the stored chunks with the split pieces, so
        the instance is modified by calling encode() (and therefore str()).
        """
        newchunks = []
        for s, charset in self._chunks:
            newchunks += self._split(s, charset)
        self._chunks = newchunks
        return self.encode_chunks()

    def encode_chunks(self):
        """MIME-encode a header with many different charsets and/or encodings.

        Walks the stored (string, charset) pairs and returns a string
        suitable for use in a header field.  A pair whose charset is None is
        added unencoded; any other pair is encoded with
        charset.header_encode(string, 0).

        The resulting lines are joined with a newline followed by a space,
        so the result has the format:

        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
         =?charset2?b?SvxyZ2VuIEL2aW5n?="
        """
        chunks = []
        for header, charset in self._chunks:
            # NOTE(review): _max_append presumably extends the last line when
            # the result still fits within _maxlinelen and starts a new line
            # otherwise -- confirm against email.quopriMIME.
            if charset is None:
                _max_append(chunks, header, self._maxlinelen, ' ')
            else:
                _max_append(chunks, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        return NLSPACE.join(chunks)