blob: 002034e169c55bd09fc82026888b26a70d443ec0 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2001,2002 Python Software Foundation
2# Author: che@debian.org (Ben Gertzfield)
3
4"""Quoted-printable content transfer encoding per RFCs 2045-2047.
5
6This module handles the content transfer encoding method defined in RFC 2045
7to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
8safely encode text that is in a character set similar to the 7-bit US ASCII
9character set, but that includes some 8-bit characters that are normally not
10allowed in email bodies or headers.
11
12Quoted-printable is very space-inefficient for encoding binary files; use the
13email.base64MIME module for that instead.
14
15This module provides an interface to encode and decode both headers and bodies
16with quoted-printable encoding.
17
RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.
21
22This module does not do the line wrapping or end-of-line character
23conversion necessary for proper internationalized headers; it only
24does dumb encoding and decoding. To deal with the various line
25wrapping issues, use the email.Header module.
26"""
27
28import re
29from string import hexdigits
30from email.Utils import fix_eols
31
# Line terminators used throughout this module.
CRLF = '\r\n'
NL = '\n'

# See also Charset.py
# Number of characters of RFC "chrome" that header_encode() wraps around each
# encoded chunk in addition to the charset name: "=?" + "?q?" + "?=".
MISC_LEN = 7

# Matches a character that header_encode() must hex-escape.  Letters, digits,
# "-", "!", "*", "+" and "/" are passed through; space is also left unmatched
# because header_encode() maps it to "_" itself.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
# Matches a character that encode() must hex-escape: anything other than
# space, tab, and the ranges "!".."<" and ">".."~" (so "=" itself is matched).
bqre = re.compile(r'[^ !-<>-~\t]')
40
41
42
43# Helpers
def header_quopri_check(c):
    """Return true if the character should be escaped with header quopri.

    The result is 1 when c begins with a character matched by hqre, and None
    otherwise.
    """
    if hqre.match(c) is None:
        return None
    return 1
47
48
def body_quopri_check(c):
    """Return true if the character should be escaped with body quopri.

    The result is 1 when c begins with a character matched by bqre, and None
    otherwise.
    """
    if bqre.match(c) is None:
        return None
    return 1
52
53
def header_quopri_len(s):
    """Return the length of s when it is encoded with header quopri.

    Every character matched by hqre is counted as three characters (its
    "=XX" escape); every other character is counted as one.
    """
    chars = list(s)
    escaped = [ch for ch in chars if hqre.match(ch)]
    # Each escaped character costs two characters more than a literal one.
    return len(chars) + 2 * len(escaped)
63
64
def body_quopri_len(str):
    """Return the length of str when it is encoded with body quopri.

    Every character matched by bqre is counted as three characters (its
    "=XX" escape); every other character is counted as one.
    """
    chars = list(str)
    escaped = [ch for ch in chars if bqre.match(ch)]
    # Each escaped character costs two characters more than a literal one.
    return len(chars) + 2 * len(escaped)
74
75
76def _max_append(L, s, maxlen, extra=''):
77 if not L:
78 L.append(s)
79 elif len(L[-1]) + len(s) < maxlen:
80 L[-1] += extra + s
81 else:
82 L.append(s)
83
84
def unquote(s):
    """Turn a string in the form =AB to the ASCII character with value 0xab

    Only s[1:3] is examined; it must be two hexadecimal digits, otherwise
    int() raises ValueError.
    """
    hex_pair = s[1:3]
    code = int(hex_pair, 16)
    return chr(code)
88
89
def quote(c):
    """Return the escape "=XX" for the single character c.

    XX is the character's ordinal as two uppercase hexadecimal digits.
    """
    code = ord(c)
    return "=%02X" % code
92
93
94
def header_encode(header, charset="iso-8859-1", keep_eols=0, maxlinelen=76,
                  eol=NL):
    """Encode a single header line with quoted-printable (like) encoding.

    This is the `Q' encoding of RFC 2047.  It is similar to quoted-printable,
    but used specifically for email header fields to allow charsets with
    mostly 7 bit characters (and some 8 bit) to remain more or less readable
    in mail clients that do not understand encoded headers.

    charset names the character set to use to encode the header.  It defaults
    to iso-8859-1.

    The resulting string will be in the form:

    "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?=\\n
      =?charset?q?Silly_=C8nglish_Kn=EEghts?="

    with each line wrapped safely at, at most, maxlinelen characters (defaults
    to 76 characters).

    End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
    to the canonical email line separator \\r\\n unless the keep_eols
    parameter is set to true (the default is false).  They are then escaped
    like any other character outside the header-safe set.

    Each line of the header will be terminated in the value of eol, which
    defaults to "\\n".  Set this to "\\r\\n" if you are using the result of
    this function directly in email.  Continuation lines start with a single
    space.
    """
    # Nothing to encode: hand empty headers straight back.
    if not header:
        return header

    if not keep_eols:
        header = fix_eols(header)

    # Room available for encoded text in each chunk once the
    # "=?charset?q?...?=" chrome has been accounted for.
    budget = maxlinelen - len(charset) - MISC_LEN

    chunks = []
    for ch in header:
        if ch == ' ':
            # Space may be represented as _ instead of =20 for readability
            piece = '_'
        elif hqre.match(ch):
            # Unsafe character: replace with its hex value, like =E2
            piece = "=%02X" % ord(ch)
        else:
            # Safe character: include verbatim
            piece = ch
        _max_append(chunks, piece, budget)

    # Wrap each chunk in the RFC chrome and glue the encoded words together.
    # BAW: should we be able to specify the leading whitespace in the joiner?
    words = []
    for chunk in chunks:
        words.append('=?%s?q?%s?=' % (charset, chunk))
    separator = eol + ' '
    return separator.join(words)
151
152
153
def encode(body, binary=0, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is false (the default), end-of-line characters will be converted
    to the canonical email end-of-line sequence \\r\\n (via fix_eols) before
    encoding.  Otherwise body is processed as given.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.  A final line without a line ending in body gets none in the
    output either.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    Whitespace at the end of a line is protected: at the very end of the body
    it is hex-escaped, elsewhere it is followed by a soft line break.

    An empty body is returned unchanged.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # Collect the output in a list and join once at the end instead of
    # growing a string by repeated concatenation.
    pieces = []
    # Preserve line endings here so we can check later whether an eol needs
    # to be added to the output.
    lines = body.splitlines(1)
    last_lineno = len(lines) - 1
    for lineno in range(len(lines)):
        line = lines[lineno]
        # Remember whether the line was terminated, then strip the terminator
        # off for processing.
        has_eol = line[-1] in CRLF
        if line.endswith(CRLF):
            line = line[:-2]
        elif has_eol:
            line = line[:-1]

        encoded_line = ''
        prev = None
        linelen = len(line)
        # Examine every character to see if it needs to be quopri encoded.
        for j in range(linelen):
            c = line[j]
            # prev always holds the raw (unescaped) character.
            prev = c
            if bqre.match(c):
                c = quote(c)
            elif j+1 == linelen:
                # Last character of the line and it needs no escaping.
                # Trailing whitespace is dealt with after the loop; anything
                # else is appended directly.
                if c not in ' \t':
                    encoded_line += c
                continue
            # Check to see if the line has reached its maximum length
            if len(encoded_line) + len(c) >= maxlinelen:
                pieces.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c
        # Now at end of line..
        if prev and prev in ' \t':
            if lineno == last_lineno:
                # Whitespace at end of file has nothing after it to protect
                # it, so it must be escaped.
                prev = quote(prev)
                if len(encoded_line) + len(prev) > maxlinelen:
                    pieces.append(encoded_line + '=' + eol + prev)
                else:
                    pieces.append(encoded_line + prev)
            else:
                # Just normal whitespace at end of line: protect it with a
                # soft line break.  The whitespace plus the "=" need two
                # columns; if they do not fit, break the line first so the
                # output never exceeds maxlinelen.
                if len(encoded_line) + len(prev) + 1 > maxlinelen:
                    pieces.append(encoded_line + '=' + eol)
                    encoded_line = ''
                pieces.append(encoded_line + prev + '=' + eol)
            encoded_line = ''
        # If the line we just finished had a line ending, add eol to the end
        # of the encoded line.
        if has_eol:
            pieces.append(encoded_line + eol)
        else:
            pieces.append(encoded_line)
    return ''.join(pieces)
234
235
# For convenience and backwards compatibility w/ standard base64 module:
# body_encode and encodestring are alternate names bound to encode().
body_encode = encode
encodestring = encode
239
240
241
242# BAW: I'm not sure if the intent was for the signature of this function to be
243# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    Trailing whitespace on each encoded line is discarded.  A "=" at the end
    of a line is a soft line break: the line is joined to the next one without
    emitting eol.  "=XX" with two hexadecimal digits is replaced by the
    character with that value; any other "=" is passed through literally.

    The result ends with eol only if encoded itself ended with a line break.
    An empty encoded string is returned unchanged.
    """
    if not encoded:
        return encoded
    # Collect the decoded pieces in a list and join once at the end instead of
    # growing a string by repeated concatenation.
    pieces = []

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            pieces.append(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                pieces.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # this is a soft line break: skip it, and skip the eol below.
            elif i+1 == n:
                i += 1
                continue
            # Decode if in form =AB
            elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
                pieces.append(unquote(line[i:i+3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                pieces.append(c)
                i += 1

            # The line was consumed without hitting a soft line break.
            if i == n:
                pieces.append(eol)

    decoded = ''.join(pieces)
    # Special case if original string did not end with a line break: drop the
    # eol that was added after its last line.  endswith() copes with an empty
    # result and with an eol longer than one character.
    if encoded[-1] not in CRLF and decoded.endswith(eol):
        decoded = decoded[:len(decoded) - len(eol)]
    return decoded
289
290
# For convenience and backwards compatibility w/ standard base64 module:
# body_decode and decodestring are alternate names bound to decode().
body_decode = decode
decodestring = decode
294
295
296
def _unquote_match(match):
    """Turn a match in the form =AB to the ASCII character with value 0xab

    match is a regular expression match object; its whole matched text is
    handed to unquote().  Intended as the replacement callback for re.sub().
    """
    return unquote(match.group(0))
301
302
303# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    Underscores are turned back into spaces and every "=XX" escape, where XX
    is two hexadecimal digits, is replaced by the character with that value.
    A "=" that is not followed by two hexadecimal digits is left untouched.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.Header class for that functionality.
    """
    s = s.replace('_', ' ')
    # Match hexadecimal digits only.  A looser pattern such as \w{2} would
    # also hand text like "=zz" to unquote(), which raises ValueError.
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s)