blob: fb8792cb36d85b99c5a93e1efc1b3d3ae9b38d5b [file] [log] [blame]
# Copyright (C) 2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)

"""Header encoding and decoding functionality."""
5
6import re
7import email.quopriMIME
8import email.base64MIME
9from email.Charset import Charset
10
Barry Warsaw812031b2002-05-19 23:47:53 +000011try:
12 from email._compat22 import _intdiv2
13except SyntaxError:
14 # Python 2.1 spells integer division differently
15 from email._compat21 import _intdiv2
16
# Line separators used when folding and joining header lines.
CRLF = '\r\n'
CRLFSPACE = '\r\n '
NLSPACE = '\n '

# Default maximum length of a single header line.
MAXLINELEN = 76

# Direction flags.
ENCODE = 1
DECODE = 2

# Matches encoded-word strings of the form =?charset?q?Hello_World?=
# The three named groups mean that ecre.split() returns the surrounding text
# interleaved with (charset, encoding, encoded) triples.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
36
37
38
# Helpers
# Local alias for the private _max_append helper defined in
# email.quopriMIME.  Header.encode_chunks() calls it as
# _max_append(list, string, maxlen, separator); its exact semantics live in
# that module (presumably a length-limited append -- confirm there).
_max_append = email.quopriMIME._max_append
41
42
43
def decode_header(header):
    """Decode a message header value without converting charset.

    The header is converted with str() and processed line by line.  Lines
    without any encoded-word are returned verbatim with a charset of None.
    In lines that do contain encoded-words, each unencoded fragment is
    stripped of surrounding whitespace and each encoded-word is decoded
    according to its "q" or "b" encoding marker.

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.  Adjacent encoded parts that share a
    charset are merged into one pair, and an unencoded fragment that directly
    follows another unencoded entry is joined to it with a single space.
    """
    header = str(header)
    # If no encoding, just return the header
    if not ecre.search(header):
        return [(header, None)]

    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue

        # ecre has three groups, so split() yields
        # [text, charset, encoding, encoded, text, charset, ...]
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Continue a long (folded) unencoded line.  The fragment was
                # stripped above, so put back a single separating space.
                if decoded and decoded[-1][1] is None:
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # The regexp only admits q/b; kept as a safe fallback.
                    dec = encoded

                # Merge consecutive chunks that use the same charset
                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the (charset, encoding, encoded) triple just consumed
            del parts[0:3]
    return decoded
91
92
93
class Header:
    """A header value built from one or more (string, charset) chunks.

    Chunks are collected with append() and turned into a single encoded,
    line-wrapped string by encode() (also available through str()).
    """

    def __init__(self, s, charset=None, maxlinelen=None, header_name=None):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  Specify its character set as a
        Charset object in the charset argument.  If none, a default Charset
        instance will be used.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can be specified explicitly via maxlinelen.
        You can also pass None for maxlinelen and the name of a header field
        (e.g. "Subject") to let the constructor guess the best line length to
        use.  The default maxlinelen is MAXLINELEN.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        self.append(s, charset)
        if maxlinelen is None:
            if header_name is None:
                self._maxlinelen = MAXLINELEN
            else:
                self.guess_maxlinelen(header_name)
        else:
            self._maxlinelen = maxlinelen

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def guess_maxlinelen(self, s=None):
        """Guess the maximum length to make each header line.

        Given a header name (e.g. "Subject"), set this header's maximum line
        length to MAXLINELEN minus the length of the name minus 2.  If s is
        not given, the stored maximum line length is left unchanged.

        Returns the (possibly updated) maximum line length.
        """
        # BAW: is this semantic necessary?
        if s is not None:
            self._maxlinelen = MAXLINELEN - len(s) - 2
        return self._maxlinelen

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset):
        """Split s into pieces whose encoded length fits self._maxlinelen.

        Returns a list of (string, charset) pairs.  A piece that cannot be
        divided any further (zero or one splittable item) is returned as-is
        even when it is still too long, so the recursion always terminates.
        """
        # BAW: this appears to be a private convenience method.
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)
        elen = charset.encoded_header_len(encoded)
        nitems = len(splittable)

        # Either it already fits, or there is nothing left to cut.
        if elen <= self._maxlinelen or nitems <= 1:
            return [(encoded, charset)]
        # BAW: should we use encoded?
        if elen == len(s):
            # We can split on _maxlinelen boundaries because we know that the
            # encoding won't change the size of the string.  Clamp the split
            # point into [1, nitems - 1] so that both halves are non-empty
            # even for a non-positive or oversized _maxlinelen.
            splitpnt = min(max(self._maxlinelen, 1), nitems - 1)
        else:
            # Divide and conquer.  BAW: halfway depends on integer division.
            # When porting to Python 2.2, use the // operator.
            splitpnt = _intdiv2(nitems)
        # The second argument 0 is passed as in the original code; presumably
        # it keeps the pieces in the input charset so the recursive call can
        # convert them again -- confirm against Charset.from_splittable.
        first = charset.from_splittable(splittable[:splitpnt], 0)
        last = charset.from_splittable(splittable[splitpnt:], 0)
        return self._split(first, charset) + self._split(last, charset)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        There are many issues involved in converting a given string for use in
        an email header.  Only certain character sets are readable in most
        email clients, and as header strings can only contain a subset of
        7-bit ASCII, care must be taken to properly convert and encode (with
        Base64 or quoted-printable) header strings.  In addition, there is a
        75-character length limit on any given encoded header field, so
        line-wrapping must be performed, even with double-byte character sets.

        Every stored chunk is split to fit the maximum line length and the
        resulting pieces are handed to encode_chunks().  The stored chunks
        themselves are left untouched, so encode() may be called repeatedly
        and append() may be used between calls.  No exceptions are caught
        here; anything raised by the Charset methods propagates.
        """
        newchunks = []
        for s, charset in self._chunks:
            newchunks += self._split(s, charset)
        return self.encode_chunks(newchunks)

    def encode_chunks(self, chunks=None):
        """MIME-encode a header with many different charsets and/or encodings.

        Given a list of pairs (string, charset), return a MIME-encoded string
        suitable for use in a header field.  If chunks is not given, the
        chunks stored on this header are used.  A pair whose charset is None
        is added unencoded; any other pair is encoded with
        charset.header_encode(string, 0).

        The pieces are accumulated with _max_append() using a single space as
        separator and self._maxlinelen as the limit, and the resulting lines
        are joined with NLSPACE, giving a string in the format:

        "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=" NLSPACE
        "=?charset2?b?SvxyZ2VuIEL2aW5n?="
        """
        if chunks is None:
            chunks = self._chunks
        lines = []
        for header, charset in chunks:
            if charset is None:
                _max_append(lines, header, self._maxlinelen, ' ')
            else:
                _max_append(lines, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        return NLSPACE.join(lines)