blob: 68dc11c695e36c171cd743dbbf6311813c9f9ad1 [file] [log] [blame]
Guido van Rossum8b3febe2007-08-30 01:15:14 +00001# Copyright (C) 2001-2006 Python Software Foundation
2# Author: Ben Gertzfield
3# Contact: email-sig@python.org
4
5"""Quoted-printable content transfer encoding per RFCs 2045-2047.
6
7This module handles the content transfer encoding method defined in RFC 2045
8to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
9safely encode text that is in a character set similar to the 7-bit US ASCII
10character set, but that includes some 8-bit characters that are normally not
11allowed in email bodies or headers.
12
13Quoted-printable is very space-inefficient for encoding binary files; use the
14email.base64MIME module for that instead.
15
16This module provides an interface to encode and decode both headers and bodies
17with quoted-printable encoding.
18
RFC 2047 defines a method for including character set information in an
`encoded-word' in a header. This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.
22
23This module does not do the line wrapping or end-of-line character
24conversion necessary for proper internationalized headers; it only
25does dumb encoding and decoding. To deal with the various line
26wrapping issues, use the email.Header module.
27"""
28
# Public API of this module.  Every name listed here must actually be defined
# in the module: a star-import raises AttributeError for any listed name that
# does not exist.  The former entries 'encode' and 'encodestring' were dropped
# because no such names are defined here (the body encoder is `body_encode`).
__all__ = [
    'body_decode',
    'body_encode',
    'body_length',
    'decode',
    'decodestring',
    'header_decode',
    'header_encode',
    'header_length',
    'quote',
    'unquote',
    ]
43
44import re
45
46from string import ascii_letters, digits, hexdigits
Guido van Rossum8b3febe2007-08-30 01:15:14 +000047
# Line-ending and joining constants used by the encoders and decoders below.
CRLF = '\r\n'
NL = '\n'
EMPTYSTRING = ''

# Two lookup tables, one for headers and one for bodies, each translating an
# octet (0-255) into the text that represents it in quoted-printable.  Both
# start out with every octet fully escaped as "=XX"; the octets that are safe
# in each context are then overridden with their literal character.
_QUOPRI_HEADER_MAP = {c: '=%02X' % c for c in range(256)}
_QUOPRI_BODY_MAP = dict(_QUOPRI_HEADER_MAP)

# Octets that may appear unescaped in a header: a few punctuation marks plus
# the ASCII letters and digits.
for c in b'-!*+/' + (ascii_letters + digits).encode('ascii'):
    _QUOPRI_HEADER_MAP[c] = chr(c)
# Headers additionally represent a space as an underscore.
_QUOPRI_HEADER_MAP[ord(' ')] = '_'

# Octets that may appear unescaped in a body: everything from space (0x20)
# through tilde (0x7E) except the equals sign (0x3D), plus the tab character.
for c in bytes(range(0x20, 0x3D)) + bytes(range(0x3E, 0x7F)) + b'\t':
    _QUOPRI_BODY_MAP[c] = chr(c)
Guido van Rossum8b3febe2007-08-30 01:15:14 +000071
72
73
74# Helpers
def header_check(octet):
    """Tell whether an octet must be altered when header-encoded.

    :param octet: An integer octet value.
    :return: True when the header map's entry for the octet differs from the
        octet's own character (this includes the space, which the header map
        represents as an underscore); False otherwise.
    """
    literal = chr(octet)
    encoded = _QUOPRI_HEADER_MAP[octet]
    return literal != encoded
Guido van Rossum8b3febe2007-08-30 01:15:14 +000078
79
def body_check(octet):
    """Tell whether an octet must be escaped when body-encoded.

    :param octet: An integer octet value.
    :return: True when the body map's entry for the octet differs from the
        octet's own character; False otherwise.
    """
    literal = chr(octet)
    encoded = _QUOPRI_BODY_MAP[octet]
    return literal != encoded
Guido van Rossum8b3febe2007-08-30 01:15:14 +000083
84
def header_length(bytearray):
    """Compute the size of a byte sequence once header-encoded.

    The result counts only the encoded payload; it does not include any of
    the RFC 2047 chrome that `header_encode()` wraps around it.

    :param bytearray: An array of bytes (a.k.a. octets).
    :return: The total length of the header quoted-printable representation
        of all octets in the array.
    """
    total = 0
    for octet in bytearray:
        total += len(_QUOPRI_HEADER_MAP[octet])
    return total
Guido van Rossum8b3febe2007-08-30 01:15:14 +000096
97
def body_length(bytearray):
    """Compute the size of a byte sequence once body-encoded.

    :param bytearray: An array of bytes (a.k.a. octets).
    :return: The total length of the body quoted-printable representation of
        all octets in the array.
    """
    total = 0
    for octet in bytearray:
        total += len(_QUOPRI_BODY_MAP[octet])
    return total
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000106
107
def _max_append(L, s, maxlen, extra=''):
    """Add `s` to the list of strings `L`, respecting a length limit.

    :param L: A list of strings; it is modified in place.
    :param s: The text to add.  A non-string value is converted with chr().
    :param maxlen: The largest combined length of the last entry of `L` and
        `s` for which `s` is still joined onto that entry.  The length of
        `extra` is not counted.
    :param extra: Separator placed between the last entry and `s` when they
        are joined.
    """
    if not isinstance(s, str):
        s = chr(s)
    fits_on_last = bool(L) and len(L[-1]) + len(s) <= maxlen
    if fits_on_last:
        L[-1] += extra + s
    else:
        # Start a new entry; leading whitespace is dropped from it.
        L.append(s.lstrip())
116 L.append(s.lstrip())
117
118
def unquote(s):
    """Convert text of the form =AB into the character with value 0xAB.

    Only the two characters following the first one are examined; they are
    interpreted as a hexadecimal number.
    """
    hex_pair = s[1:3]
    code = int(hex_pair, 16)
    return chr(code)
122
123
def quote(c):
    """Return the escaped form =XX of the single character `c`.

    XX is the character's code point as at least two upper-case hexadecimal
    digits.
    """
    code = ord(c)
    return '=%02X' % code
126
127
128
def header_encode(header_bytes, charset='iso-8859-1'):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in
    non-MIME-aware mail clients.

    :param header_bytes: The octets to encode (an iterable of integers such
        as a bytes object).
    :param charset: The character set name placed in the resulting RFC 2047
        encoded-word.  It defaults to iso-8859-1.
    :return: The encoded-word as a string, or the empty string when
        `header_bytes` is empty.
    """
    # Empty headers encode to the empty string.  Do not use str() on the
    # input here: str(b'') is the four-character text "b''", not ''.
    if not header_bytes:
        return ''
    # Translate every octet through the header map, then add the RFC chrome
    # around the joined result.
    encoded = EMPTYSTRING.join(_QUOPRI_HEADER_MAP[octet]
                               for octet in header_bytes)
    return '=?%s?q?%s?=' % (charset, encoded)
150
151
152
def body_encode(body, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    :param body: The text to encode.
    :param maxlinelen: Maximum length of an encoded line, not counting eol.
    :param eol: The line ending written after each encoded line.
    :return: The encoded text; a false `body` is returned unchanged.
    """
    if not body:
        return body

    # Finished pieces of output.  They are joined once at the end instead of
    # growing one big string by repeated concatenation.
    chunks = []
    # Preserve line endings here so we can tell later whether an eol needs to
    # be added after each encoded line.
    lines = body.splitlines(True)
    last_lineno = len(lines) - 1
    for lineno, raw_line in enumerate(lines):
        # Strip off the line ending for processing this line.
        ends_with_newline = raw_line[-1] in CRLF
        if raw_line.endswith(CRLF):
            line = raw_line[:-2]
        elif ends_with_newline:
            line = raw_line[:-1]
        else:
            line = raw_line

        encoded_line = ''
        # Set to the final character of the line when that character is a
        # space or tab; such whitespace needs special treatment below.
        trailing_ws = None
        final_index = len(line) - 1
        for j, c in enumerate(line):
            if body_check(ord(c)):
                c = quote(c)
            elif j == final_index:
                # The last character is safe.  Hold back trailing whitespace;
                # anything else is appended as is.
                if c in ' \t':
                    trailing_ws = c
                else:
                    encoded_line += c
                break
            # Insert a soft line break if the line has reached its maximum
            # length.
            if len(encoded_line) + len(c) >= maxlinelen:
                chunks.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c

        # Now at end of line..
        if trailing_ws is not None:
            if lineno == last_lineno:
                # Whitespace at end of file is escaped.
                quoted_ws = quote(trailing_ws)
                if len(encoded_line) + len(quoted_ws) > maxlinelen:
                    chunks.append(encoded_line + '=' + eol + quoted_ws)
                else:
                    chunks.append(encoded_line + quoted_ws)
            else:
                # Ordinary whitespace at end of line is protected by a soft
                # line break.  The whitespace plus "=" needs two characters;
                # if they do not fit, break the line first so the limit of
                # maxlinelen characters is not exceeded.
                if len(encoded_line) + 2 > maxlinelen:
                    chunks.append(encoded_line + '=' + eol)
                    encoded_line = ''
                chunks.append(encoded_line + trailing_ws + '=' + eol)
            encoded_line = ''

        # Emit what is left of the line and, if the source line had a line
        # ending, the eol.
        chunks.append(encoded_line)
        if ends_with_newline:
            chunks.append(eol)
    return EMPTYSTRING.join(chunks)
226
227
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000228
# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    :param encoded: The quoted-printable text to decode.
    :param eol: The line ending written after each decoded line.
    :return: The decoded text; a false `encoded` is returned unchanged.  The
        result ends with eol only if `encoded` ended with a line ending.
    """
    if not encoded:
        return encoded

    # Decoded fragments, joined once at the end.
    pieces = []
    # True when the most recent fragment is an eol appended by this function
    # (as opposed to decoded content or a soft line break).
    appended_eol = False

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            pieces.append(eol)
            appended_eol = True
            continue

        i = 0
        n = len(line)
        soft_break = False
        while i < n:
            c = line[i]
            if c != '=':
                pieces.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # this is a soft line break: the line continues without an eol.
            elif i + 1 == n:
                soft_break = True
                break
            # Decode if in form =AB
            elif (i + 2 < n and line[i + 1] in hexdigits
                  and line[i + 2] in hexdigits):
                pieces.append(unquote(line[i:i + 3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                pieces.append(c)
                i += 1

        if soft_break:
            appended_eol = False
        else:
            pieces.append(eol)
            appended_eol = True

    # Special case if the original string did not end with a line ending:
    # drop the eol appended for its last line.  Removing the whole fragment
    # (rather than a single character) is correct for an eol of any length,
    # and it never removes decoded content.
    if appended_eol and encoded[-1] not in CRLF:
        pieces.pop()
    return EMPTYSTRING.join(pieces)
276
277
# Alternative names for decode(), provided for convenience and for backwards
# compatibility with the standard base64 module.
body_decode = decodestring = decode
281
282
283
def _unquote_match(match):
    """Return the character encoded by a regex match of the form =AB.

    The whole matched text is handed to unquote(), which yields the character
    with value 0xAB.
    """
    return unquote(match.group(0))
288
289
# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.Header class for that functionality.

    :param s: The `Q' encoded text.
    :return: The text with every underscore turned into a space and every
        =XX escape (XX being two hexadecimal digits) turned into the
        character with that value.  An "=" that is not followed by two
        hexadecimal digits is left as is.
    """
    s = s.replace('_', ' ')
    # Match hexadecimal digits only.  A looser pattern such as \w{2} would
    # also match text like "=zz", which unquote() cannot convert.
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s)
299 return re.sub(r'=\w{2}', _unquote_match, s)