# Source blob: 097b9783335c38b41fc43b0dd2c623bb0d519009
# Blame (line 1): Barry Warsaw, 409a4c0, 2002-04-10 21:01:31 +0000
# Copyright (C) 2002 Python Software Foundation
# Author: che@debian.org (Ben Gertzfield)

"""Header encoding and decoding functionality."""
5
6import re
7import email.quopriMIME
8import email.base64MIME
9from email.Charset import Charset
10
# Line terminators and the separators used when folding header lines.
CRLF = '\r\n'
CRLFSPACE = CRLF + ' '
NLSPACE = '\n '

# Default maximum length of a single header line.
MAXLINELEN = 76

# Mode constants (not referenced by the code visible in this module).
ENCODE, DECODE = 1, 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# Because the pattern has three capturing groups, ecre.split() yields
# [text, charset, encoding, encoded, text, ...].
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)
30
31
32
# Helpers
# Module-level alias for the private _max_append helper of
# email.quopriMIME; Header.encode_chunks() uses it to accumulate encoded
# pieces into a list of output lines.
# NOTE(review): presumably it joins a piece onto the last list entry while
# the result stays within the given maximum length -- confirm against
# email.quopriMIME.
_max_append = email.quopriMIME._max_append
35
36
37
def decode_header(header):
    """Decode a message header value without converting charset.

    header is converted with str() and scanned line by line for RFC 2047
    style encoded words (=?charset?q?...?= or =?charset?b?...?=).

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of
    the header, otherwise a lower-case string containing the name of the
    character set specified in the encoded string.

    Adjacent encoded words that name the same charset are merged into one
    pair.  Unencoded text is stripped of surrounding whitespace; when it
    continues a previous unencoded part (which can only happen across a
    folded line boundary) it is joined to that part with a single space.
    If the header contains no encoded word at all, the whole string is
    returned untouched as a single (header, None) pair.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]

    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue

        # ecre has three groups, so split() yields
        # [text, charset, encoding, encoded, text, charset, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Append the unencoded text itself.  (Appending the
                    # previously decoded word here would drop this text.)
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # Defensive only: ecre matches nothing but "q" or "b".
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    # Same charset as the previous part: merge them.
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the (charset, encoding, encoded) triple just consumed;
            # harmless when parts is already empty.
            del parts[0:3]
    return decoded
85
86
87
class Header:
    """A header value assembled from (string, charset) chunks.

    Chunks are collected with append() and turned into a single encoded,
    line-wrapped string by encode() (also available through str()).
    """

    def __init__(self, s, charset=None, maxlinelen=MAXLINELEN,
                 header_name=None):
        """Create a MIME-compliant header that can contain many languages.

        Specify the initial header value in s.  Specify its character set as
        a Charset object in the charset argument.  If None, a default
        Charset instance will be used.

        You can later append to the header with append(s, charset) below;
        charset does not have to be the same as the one initially specified
        here.  In fact, it's optional, and if not given, defaults to the
        charset specified in the constructor.

        The maximum line length can either be specified by maxlinelen, or
        you can pass in the name of the header field (e.g. "Subject") as
        header_name, in which case guess_maxlinelen() derives the limit from
        it and overrides maxlinelen.  The default maxlinelen is 76.
        """
        if charset is None:
            charset = Charset()
        self._charset = charset
        self._chunks = []
        self.append(s, charset)
        self._maxlinelen = maxlinelen
        if header_name is not None:
            self.guess_maxlinelen(header_name)

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def guess_maxlinelen(self, s=None):
        """Guess the maximum length to make each header line.

        Given a header name (e.g. "Subject"), set this header's maximum line
        length to MAXLINELEN - len(s) - 2.  If s is not given, the current
        maximum line length is left unchanged.

        Returns the (possibly updated) maximum line length.
        """
        # BAW: is this semantic necessary?
        if s is not None:
            # NOTE(review): the extra 2 presumably accounts for the ": "
            # that follows the field name -- confirm.
            self._maxlinelen = MAXLINELEN - len(s) - 2
        return self._maxlinelen

    def append(self, s, charset=None):
        """Append string s with Charset charset to the MIME header.

        charset defaults to the one given in the class constructor.
        """
        if charset is None:
            charset = self._charset
        self._chunks.append((s, charset))

    def _split(self, s, charset):
        """Split s into (string, charset) pieces short enough to encode.

        Private helper for encode().  The string is converted with
        charset.to_splittable() and halved recursively until the encoded
        length reported by charset.encoded_header_len() is below the
        maximum line length.  A piece that cannot be halved any further is
        returned as is, even if it is still too long.
        """
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable)

        if charset.encoded_header_len(encoded) < self._maxlinelen:
            return [(encoded, charset)]
        if len(splittable) <= 1:
            # Nothing left to divide.  Halving would hand the whole piece
            # back to the recursion and never terminate (e.g. when the
            # maximum line length is tiny or negative).
            return [(encoded, charset)]
        # Divide and conquer; the midpoint relies on integer division.
        halfway = len(splittable) // 2
        first = charset.from_splittable(splittable[:halfway], 0)
        last = charset.from_splittable(splittable[halfway:], 0)
        return self._split(first, charset) + self._split(last, charset)

    def encode(self):
        """Encode a message header, possibly converting charset and encoding.

        Each appended (string, charset) chunk is first cut by _split() into
        pieces whose encoded form fits within the maximum line length; the
        pieces are then encoded and line-wrapped by encode_chunks().

        The stored chunks are not modified, so encode() (and str()) may be
        called repeatedly, and append() may be used between calls.

        Returns the encoded header as a string.
        """
        split_chunks = []
        for s, charset in self._chunks:
            split_chunks.extend(self._split(s, charset))
        return self.encode_chunks(split_chunks)

    def encode_chunks(self, chunks=None):
        """MIME-encode a header with many different charsets and/or encodings.

        chunks is a list of (string, charset) pairs; when omitted, the
        chunks stored on this header are used.  A pair whose charset is None
        is emitted as is; any other pair is emitted as
        charset.header_encode(string, 0).

        The pieces are accumulated with _max_append(), using the maximum
        line length and a single space as separator, and the resulting
        lines are joined with a newline followed by a space.

        Returns the encoded header as a string.
        """
        if chunks is None:
            chunks = self._chunks
        lines = []
        for header, charset in chunks:
            if charset is None:
                _max_append(lines, header, self._maxlinelen, ' ')
            else:
                _max_append(lines, charset.header_encode(header, 0),
                            self._maxlinelen, ' ')
        return NLSPACE.join(lines)