blob: 85efc088aa2a4ec2a4a2adebd7bdfd1eaa303f12 [file] [log] [blame]
# Copyright (C) 2001-2006 Python Software Foundation
# Author: Ben Gertzfield
# Contact: email-sig@python.org

"""Quoted-printable content transfer encoding per RFCs 2045-2047.

This module handles the content transfer encoding method defined in RFC 2045
to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
safely encode text that is in a character set similar to the 7-bit US ASCII
character set, but that includes some 8-bit characters that are normally not
allowed in email bodies or headers.

Quoted-printable is very space-inefficient for encoding binary files; use the
email.base64mime module for that instead.

This module provides an interface to encode and decode both headers and bodies
with quoted-printable encoding.

RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.

This module does not do the line wrapping or end-of-line character
conversion necessary for proper internationalized headers; it only
does dumb encoding and decoding.  To deal with the various line
wrapping issues, use the email.header module.
"""
28
# Names exported by ``from <this module> import *``.
__all__ = [
    'body_decode', 'body_encode', 'body_length',
    'decode', 'decodestring',
    'header_decode', 'header_encode', 'header_length',
    'quote', 'unquote',
]
41
42import re
43
44from string import ascii_letters, digits, hexdigits
Guido van Rossum8b3febe2007-08-30 01:15:14 +000045
CRLF = '\r\n'
NL = '\n'
EMPTYSTRING = ''

# Two lookup tables, keyed by octet value (0-255), giving the text that the
# octet turns into when encoded.  Every entry starts out as the three
# character "=XX" escape; the octets that are safe to send as-is are then
# overwritten with the plain character.  Headers and bodies have different
# sets of safe octets, hence the two separate tables.
_QUOPRI_HEADER_MAP = {c: '=%02X' % c for c in range(256)}
_QUOPRI_BODY_MAP = dict(_QUOPRI_HEADER_MAP)

# Octets that may appear unescaped in a header.
for c in b'-!*+/' + (ascii_letters + digits).encode('ascii'):
    _QUOPRI_HEADER_MAP[c] = chr(c)
# In headers (only), a space is written as an underscore.
_QUOPRI_HEADER_MAP[ord(' ')] = '_'

# Octets that may appear unescaped in a body.  Note that '=' is absent.
for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
          b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
          b'abcdefghijklmnopqrstuvwxyz{|}~\t'):
    _QUOPRI_BODY_MAP[c] = chr(c)
Guido van Rossum8b3febe2007-08-30 01:15:14 +000069
70
Antoine Pitroufd036452008-08-19 17:56:33 +000071
Guido van Rossum8b3febe2007-08-30 01:15:14 +000072# Helpers
def header_check(octet):
    """Tell whether *octet* is changed by header quoted-printable encoding.

    :param octet: An integer octet value; it is used as a key into the
        header encoding table.
    :return: True when the table entry for the octet differs from the
        plain character ``chr(octet)``, False when the octet is emitted
        as-is.
    """
    plain = chr(octet)
    return _QUOPRI_HEADER_MAP[octet] != plain
Guido van Rossum8b3febe2007-08-30 01:15:14 +000076
77
def body_check(octet):
    """Tell whether *octet* is changed by body quoted-printable encoding.

    :param octet: An integer octet value; it is used as a key into the
        body encoding table.
    :return: True when the table entry for the octet differs from the
        plain character ``chr(octet)``, False when the octet is emitted
        as-is.
    """
    plain = chr(octet)
    return _QUOPRI_BODY_MAP[octet] != plain
Guido van Rossum8b3febe2007-08-30 01:15:14 +000081
82
def header_length(bytearray):
    """Compute the size of a header after quoted-printable encoding.

    The RFC 2047 chrome that `header_encode()` wraps around the encoded
    text is not counted.

    :param bytearray: An iterable of octets (integers usable as keys into
        the header encoding table).
    :return: The summed length of the encoded form of every octet.
    """
    total = 0
    for octet in bytearray:
        total += len(_QUOPRI_HEADER_MAP[octet])
    return total
Guido van Rossum8b3febe2007-08-30 01:15:14 +000094
95
def body_length(bytearray):
    """Compute the size of a body after quoted-printable encoding.

    :param bytearray: An iterable of octets (integers usable as keys into
        the body encoding table).
    :return: The summed length of the encoded form of every octet.
    """
    total = 0
    for octet in bytearray:
        total += len(_QUOPRI_BODY_MAP[octet])
    return total
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000104
105
106def _max_append(L, s, maxlen, extra=''):
107 if not isinstance(s, str):
108 s = chr(s)
109 if not L:
110 L.append(s.lstrip())
111 elif len(L[-1]) + len(s) <= maxlen:
112 L[-1] += extra + s
113 else:
114 L.append(s.lstrip())
115
116
def unquote(s):
    """Convert an escape of the form =AB into the character it stands for.

    The two characters following the first one are read as a hexadecimal
    number, and the character with that code is returned.
    """
    hex_pair = s[1:3]
    return chr(int(hex_pair, 16))
120
121
def quote(c):
    """Return the =XX escape for the single character *c*.

    XX is the character's code written as at least two upper-case
    hexadecimal digits.
    """
    code = ord(c)
    return '=%02X' % code
124
125
Antoine Pitroufd036452008-08-19 17:56:33 +0000126
def header_encode(header_bytes, charset='iso-8859-1'):
    """Encode a single header line with quoted-printable (like) encoding.

    Defined in RFC 2047, this `Q' encoding is similar to quoted-printable, but
    used specifically for email header fields to allow charsets with mostly 7
    bit characters (and some 8 bit) to remain more or less readable in non-RFC
    2047 aware mail clients.

    :param header_bytes: The octets of the header value (an iterable of
        integers, e.g. a bytes object).
    :param charset: Character set name placed in the encoded-word prefix.
        It defaults to iso-8859-1.
    :return: A string of the form ``=?charset?q?...?=``, or the empty string
        when *header_bytes* is empty.
    """
    # An empty header encodes to an empty string.  (Calling str() on an empty
    # bytes object would yield the text "b''", not an empty string.)
    if not header_bytes:
        return ''
    # Translate every octet through the header table and glue the pieces
    # together, then add the RFC 2047 chrome around the result.
    encoded = EMPTYSTRING.join(
        _QUOPRI_HEADER_MAP[octet] for octet in header_bytes)
    return '=?%s?q?%s?=' % (charset, encoded)
148
149
Antoine Pitroufd036452008-08-19 17:56:33 +0000150
def body_encode(body, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    Each line of encoded text will end with eol, which defaults to "\\n".  Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters).  Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    :param body: The text to encode.  A false value (e.g. the empty string)
        is returned unchanged.
    :param maxlinelen: Maximum length of an encoded output line.
    :param eol: The line terminator written to the output.
    :return: The encoded text as a string.
    """
    if not body:
        return body

    # Finished pieces of output are collected here and joined once at the
    # end, rather than growing one big string piece by piece.
    chunks = []
    # Keep the line endings so we can tell, per line, whether an eol has to
    # be written after it.
    lines = body.splitlines(True)
    last_lineno = len(lines) - 1
    for lineno, raw_line in enumerate(lines):
        # Strip the line ending off for processing, remembering whether
        # there was one.
        if raw_line.endswith(CRLF):
            line = raw_line[:-2]
            has_eol = True
        elif raw_line[-1] in CRLF:
            line = raw_line[:-1]
            has_eol = True
        else:
            line = raw_line
            has_eol = False

        encoded_line = ''
        prev = None
        last_index = len(line) - 1
        # Examine every character to see if it needs to be escaped.
        for j, c in enumerate(line):
            # Remember the raw (unescaped) character; after the loop this is
            # the last character of the line.
            prev = c
            if body_check(ord(c)):
                c = quote(c)
            elif j == last_index:
                # Last character of the line and it needs no escaping.
                # Trailing whitespace is held back and dealt with below;
                # anything else is simply added.
                if c not in ' \t':
                    encoded_line += c
                continue
            # Emit a soft line break if adding this piece would leave no
            # room for the "=" that marks a soft break.
            if len(encoded_line) + len(c) >= maxlinelen:
                chunks.append(encoded_line + '=' + eol)
                encoded_line = ''
            encoded_line += c

        # Now at end of line: handle whitespace that was held back.
        if prev and prev in ' \t':
            if lineno == last_lineno:
                # Whitespace at the very end of the text is escaped.
                prev = quote(prev)
                if len(encoded_line) + len(prev) > maxlinelen:
                    chunks.append(encoded_line + '=' + eol + prev)
                else:
                    chunks.append(encoded_line + prev)
            else:
                # Whitespace at the end of an inner line is protected by a
                # soft line break following it.
                chunks.append(encoded_line + prev + '=' + eol)
            encoded_line = ''

        # If the line we just finished had a line ending, write eol after it.
        if has_eol:
            chunks.append(encoded_line + eol)
        else:
            chunks.append(encoded_line)
    return EMPTYSTRING.join(chunks)
224
225
Antoine Pitroufd036452008-08-19 17:56:33 +0000226
Guido van Rossum8b3febe2007-08-30 01:15:14 +0000227# BAW: I'm not sure if the intent was for the signature of this function to be
228# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    :param encoded: The quoted-printable text.  A false value (e.g. the
        empty string) is returned unchanged.
    :param eol: The line terminator written between decoded lines.
    :return: The decoded text as a string.  A final eol is present only if
        the input itself ended with a line break.
    """
    if not encoded:
        return encoded

    # Decoded pieces are collected here and joined once at the end.
    pieces = []
    # True when the most recent piece is an eol written to terminate a line
    # (as opposed to a newline that was decoded from the data itself).
    ended_with_eol = False

    for line in encoded.splitlines():
        line = line.rstrip()
        if not line:
            pieces.append(eol)
            ended_with_eol = True
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                pieces.append(c)
                i += 1
            elif i + 1 == n:
                # "=" as the last character is a soft line break: the line
                # continues on the next one, so no eol is written.
                ended_with_eol = False
                break
            elif (i + 2 < n and line[i + 1] in hexdigits
                    and line[i + 2] in hexdigits):
                # Decode if in form =AB
                pieces.append(unquote(line[i:i + 3]))
                i += 3
            else:
                # Otherwise, not in form =AB, pass literally
                pieces.append(c)
                i += 1
        else:
            # The line was consumed without hitting a soft line break.
            pieces.append(eol)
            ended_with_eol = True

    # Special case if the original string did not end with a line break:
    # drop the eol that was written after the final line.  Removing the whole
    # piece (instead of one character) keeps this correct for a multi-
    # character eol, and the flag keeps a decoded "=0A" from being eaten.
    if ended_with_eol and encoded[-1] not in CRLF:
        pieces.pop()
    return EMPTYSTRING.join(pieces)
273 return decoded
274
275
# Alternative names for decode(), kept for convenience and for backwards
# compatibility with the standard base64 module.
body_decode = decodestring = decode
279
280
Antoine Pitroufd036452008-08-19 17:56:33 +0000281
def _unquote_match(match):
    """Replacement callback: decode the =AB escape covered by *match*.

    The whole matched text is handed to unquote() and its result returned.
    """
    return unquote(match.group(0))
286
287
288# Header decoding is done a bit differently
def header_decode(s):
    """Decode a string encoded with RFC 2047 MIME header `Q' encoding.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    the high level email.header class for that functionality.

    :param s: The encoded text (without the encoded-word chrome).
    :return: The text with every underscore turned into a space and every
        =AB escape replaced by the character it stands for.
    """
    s = s.replace('_', ' ')
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub() is `count`, so a positional re.ASCII would silently limit the
    # number of substitutions instead of setting the flag.
    return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, flags=re.ASCII)
R. David Murrayf9c957f2010-10-01 15:45:48 +0000297 return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, re.ASCII)