Blame - Lib/email/quoprimime.py - platform/external/python/cpython3

blob: bc02281b1d7f1cf8341c042f69e41eaad8de0925 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2001-2006 Python Software Foundation
				2	# Author: Ben Gertzfield
				3	# Contact: email-sig@python.org
				4
				5	"""Quoted-printable content transfer encoding per RFCs 2045-2047.
				6
				7	This module handles the content transfer encoding method defined in RFC 2045
				8	to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
				9	safely encode text that is in a character set similar to the 7-bit US ASCII
				10	character set, but that includes some 8-bit characters that are normally not
				11	allowed in email bodies or headers.
				12
				13	Quoted-printable is very space-inefficient for encoding binary files; use the
Amaury Forgeot d'Arc	1c25de6	2009-07-12 16:43:19 +0000	[diff] [blame]	14	email.base64mime module for that instead.
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	15
				16	This module provides an interface to encode and decode both headers and bodies
				17	with quoted-printable encoding.
				18
				19	RFC 2047 defines a method for including character set information in an
				20	`encoded-word' in a header. This method is commonly used for 8-bit real names
				21	in To:/From:/Cc: etc. fields, as well as Subject: lines.
				22
				23	This module does not do the line wrapping or end-of-line character
				24	conversion necessary for proper internationalized headers; it only
				25	does dumb encoding and decoding. To deal with the various line
Amaury Forgeot d'Arc	1c25de6	2009-07-12 16:43:19 +0000	[diff] [blame]	26	wrapping issues, use the email.header module.
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	27	"""
				28
# Names exported when this module is star-imported.
__all__ = [
    'body_decode',
    'body_encode',
    'body_length',
    'decode',
    'decodestring',
    'header_decode',
    'header_encode',
    'header_length',
    'quote',
    'unquote',
    ]
				41
				42	import re
R David Murray	b938c8c	2011-03-24 12:19:26 -0400	[diff] [blame]	43	import io
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	44
				45	from string import ascii_letters, digits, hexdigits
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	46
				47	CRLF = '\r\n'
				48	NL = '\n'
				49	EMPTYSTRING = ''
				50
Guido van Rossum	9604e66	2007-08-30 03:46:43 +0000	[diff] [blame]	51	# Build a mapping of octets to the expansion of that octet. Since we're only
				52	# going to have 256 of these things, this isn't terribly inefficient
				53	# space-wise. Remember that headers and bodies have different sets of safe
				54	# characters. Initialize both maps with the full expansion, and then override
				55	# the safe bytes with the more compact form.
				56	_QUOPRI_HEADER_MAP = dict((c, '=%02X' % c) for c in range(256))
				57	_QUOPRI_BODY_MAP = _QUOPRI_HEADER_MAP.copy()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	58
Guido van Rossum	9604e66	2007-08-30 03:46:43 +0000	[diff] [blame]	59	# Safe header bytes which need no encoding.
Barry Warsaw	2cc1f6d	2007-08-30 14:28:55 +0000	[diff] [blame]	60	for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'):
Guido van Rossum	9604e66	2007-08-30 03:46:43 +0000	[diff] [blame]	61	_QUOPRI_HEADER_MAP[c] = chr(c)
				62	# Headers have one other special encoding; spaces become underscores.
				63	_QUOPRI_HEADER_MAP[ord(' ')] = '_'
Barry Warsaw	8b3d659	2007-08-30 02:10:49 +0000	[diff] [blame]	64
Guido van Rossum	9604e66	2007-08-30 03:46:43 +0000	[diff] [blame]	65	# Safe body bytes which need no encoding.
				66	for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
				67	b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
				68	b'abcdefghijklmnopqrstuvwxyz{\|}~\t'):
				69	_QUOPRI_BODY_MAP[c] = chr(c)
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	70
				71
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	72
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	73	# Helpers
def header_check(octet):
    """Tell whether *octet* needs escaping in a quoted-printable header.

    :param octet: An integer byte value.
    :return: True when the header map's entry for the octet differs from
        the plain character, i.e. the octet cannot be emitted literally.
    """
    literal = chr(octet)
    return literal != _QUOPRI_HEADER_MAP[octet]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	77
				78
def body_check(octet):
    """Tell whether *octet* needs escaping in a quoted-printable body.

    :param octet: An integer byte value.
    :return: True when the body map's entry for the octet differs from the
        plain character, i.e. the octet cannot be emitted literally.
    """
    literal = chr(octet)
    return literal != _QUOPRI_BODY_MAP[octet]
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	82
				83
def header_length(bytearray):
    """Compute how long the header quoted-printable form of the input is.

    The RFC 2047 chrome added by `header_encode()` is not counted.

    :param bytearray: An array of bytes (a.k.a. octets).
    :return: The summed length of each octet's header-map expansion.
    """
    total = 0
    for octet in bytearray:
        total += len(_QUOPRI_HEADER_MAP[octet])
    return total
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	95
				96
def body_length(bytearray):
    """Compute how long the body quoted-printable form of the input is.

    :param bytearray: An array of bytes (a.k.a. octets).
    :return: The summed length of each octet's body-map expansion.
    """
    total = 0
    for octet in bytearray:
        total += len(_QUOPRI_BODY_MAP[octet])
    return total
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	105
				106
				107	def _max_append(L, s, maxlen, extra=''):
				108	if not isinstance(s, str):
				109	s = chr(s)
				110	if not L:
				111	L.append(s.lstrip())
				112	elif len(L[-1]) + len(s) <= maxlen:
				113	L[-1] += extra + s
				114	else:
				115	L.append(s.lstrip())
				116
				117
				118	def unquote(s):
				119	"""Turn a string in the form =AB to the ASCII character with value 0xab"""
				120	return chr(int(s[1:3], 16))
				121
				122
				123	def quote(c):
				124	return '=%02X' % ord(c)
				125
				126
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	127
def header_encode(header_bytes, charset='iso-8859-1'):
    """Encode a single header line with quoted-printable (like) encoding.

    This is the `Q' encoding of RFC 2047: similar to quoted-printable, but
    meant for email header fields, so that charsets with mostly 7 bit
    characters (and some 8 bit) stay more or less readable in mail clients
    that do not understand encoded words.

    :param header_bytes: The octets to encode.
    :param charset: The character set name placed in the encoded word; it
        defaults to iso-8859-1.
    :return: The text wrapped as ``=?charset?q?...?=``, or an empty string
        when header_bytes is empty.
    """
    # Empty headers stay empty; no chrome is added.
    if not header_bytes:
        return ''
    # Map every octet to its header form, then wrap the result in the
    # encoded-word chrome.
    payload = EMPTYSTRING.join(
        _QUOPRI_HEADER_MAP[octet] for octet in header_bytes)
    return '=?%s?q?%s?=' % (charset, payload)
				149
				150
class _body_accumulator(io.StringIO):
    """A StringIO that tracks how much room is left on the current line.

    `room` starts at `maxlinelen`, shrinks by the length of everything
    written through write_str(), and is reset whenever newline() is called.
    The write_* helpers use it to decide when a quoted-printable soft line
    break ("=" followed by eol) has to be inserted.
    """

    def __init__(self, maxlinelen, eol, *args, **kw):
        """Create the accumulator.

        :param maxlinelen: Maximum number of characters per output line
            (not counting eol).
        :param eol: The line terminator string to emit.
        :param args: Extra positional arguments forwarded to io.StringIO.
        :param kw: Extra keyword arguments forwarded to io.StringIO.
        """
        # *args/**kw must be optional: callers normally pass only
        # maxlinelen and eol.
        super().__init__(*args, **kw)
        self.eol = eol
        self.maxlinelen = self.room = maxlinelen

    def write_str(self, s):
        """Add string s to the accumulated body."""
        self.write(s)
        self.room -= len(s)

    def newline(self):
        """Write eol, then start new line."""
        self.write_str(self.eol)
        self.room = self.maxlinelen

    def write_soft_break(self):
        """Write a soft break, then start a new line."""
        self.write_str('=')
        self.newline()

    def write_wrapped(self, s, extra_room=0):
        """Add a soft line break if needed, then write s."""
        if self.room < len(s) + extra_room:
            self.write_soft_break()
        self.write_str(s)

    def write_char(self, c, is_last_char):
        """Write one (possibly already quoted) character of an input line.

        :param c: The text to write: a literal character or its "=XX" form.
        :param is_last_char: True when c is the final character of the
            input line; trailing whitespace is then written quoted.
        """
        if not is_last_char:
            # Another character follows on this line, so we must leave
            # extra room, either for it or a soft break, and whitespace
            # need not be quoted.
            self.write_wrapped(c, extra_room=1)
        elif c not in ' \t':
            # For this and remaining cases, no more characters follow,
            # so there is no need to reserve extra room (since a hard
            # break will immediately follow).
            self.write_wrapped(c)
        elif self.room >= 3:
            # It's a whitespace character at end-of-line, and we have room
            # for the three-character quoted encoding.
            self.write(quote(c))
        elif self.room == 2:
            # There's room for the whitespace character and a soft break.
            self.write(c)
            self.write_soft_break()
        else:
            # There's room only for a soft break.  The quoted whitespace
            # will be the only content on the subsequent line.
            self.write_soft_break()
            self.write(quote(c))
				203
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	204
def body_encode(body, maxlinelen=76, eol=NL):
    """Quoted-printable encode *body*, wrapping lines at maxlinelen.

    Every encoded line is terminated with eol, which defaults to "\\n".
    Pass "\\r\\n" if the result will be used directly in an email.

    No encoded line is longer than maxlinelen characters before the eol
    string (the default, 76, is the maximum allowed by RFC 2045).  Lines
    that would be longer are split with the quoted-printable 'soft line
    break' character "=", so decoding yields the original text again.

    maxlinelen must be at least 4, which leaves room for one quoted
    character ("=XX") followed by a soft line break; smaller values raise
    ValueError.  An empty body is returned unchanged.
    """
    if maxlinelen < 4:
        raise ValueError("maxlinelen must be at least 4")
    if not body:
        return body

    # Every input line ends in an eol except, possibly, the final one.
    ends_with_eol = body[-1] in '\r\n'

    # The accumulator keeps track of the room left on the output line.
    out = _body_accumulator(maxlinelen, eol)

    source_lines = body.splitlines()
    final_line = len(source_lines) - 1
    for number, text in enumerate(source_lines):
        final_char = len(text) - 1
        for pos, char in enumerate(text):
            token = quote(char) if body_check(ord(char)) else char
            out.write_char(token, pos == final_char)
        # Re-emit the line ending if the input line had one.
        if number < final_line or ends_with_eol:
            out.newline()

    return out.getvalue()
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	249
				250
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	251
# BAW: I'm not sure if the intent was for the signature of this function to be
# the same as base64MIME.decode() or not...
def decode(encoded, eol=NL):
    """Decode a quoted-printable string.

    Lines are separated with eol, which defaults to \\n.

    :param encoded: The quoted-printable text.  An empty value is returned
        unchanged.
    :param eol: The string written after every decoded line that did not
        end in a soft line break.
    :return: The decoded text.  Trailing whitespace on each encoded line is
        dropped, "=XX" sequences become the character with that value, a
        trailing "=" joins the line with the next one, and any other "=" is
        passed through literally.  If the input did not end with a line
        ending, no eol is added after the last line.
    """
    if not encoded:
        return encoded

    # Collect the pieces in a list and join once at the end; repeated string
    # concatenation would be quadratic.
    pieces = []
    # True while the most recently appended piece is an eol this function
    # added to terminate a line (as opposed to decoded content).
    added_eol = False

    for line in encoded.splitlines():
        line = line.rstrip()
        added_eol = True
        if not line:
            pieces.append(eol)
            continue

        i = 0
        n = len(line)
        while i < n:
            c = line[i]
            if c != '=':
                pieces.append(c)
                i += 1
            # Otherwise, c == "=".  Are we at the end of the line?  If so,
            # it is a soft line break: the line gets no eol.
            elif i+1 == n:
                added_eol = False
                break
            # Decode if in form =AB
            elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
                pieces.append(unquote(line[i:i+3]))
                i += 3
            # Otherwise, not in form =AB, pass literally
            else:
                pieces.append(c)
                i += 1

        if added_eol:
            pieces.append(eol)

    # Special case if original string did not end with eol: drop the eol we
    # added for the last line.  Removing the piece itself (rather than one
    # character) is correct for any eol length and never touches decoded
    # content such as a literal "=0A".
    if added_eol and encoded[-1] not in '\r\n':
        pieces.pop()
    return EMPTYSTRING.join(pieces)


# For convenience and backwards compatibility w/ standard base64 module
body_decode = decode
decodestring = decode
				304
				305
Antoine Pitrou	fd03645	2008-08-19 17:56:33 +0000	[diff] [blame]	306
def _unquote_match(match):
    """Replacement callback: decode the =AB text matched by *match*.

    The whole matched text is handed to unquote(), and the resulting
    character is returned.
    """
    return unquote(match.group(0))
				311
				312
				313	# Header decoding is done a bit differently
				314	def header_decode(s):
				315	"""Decode a string encoded with RFC 2045 MIME header `Q' encoding.
				316
				317	This function does not parse a full MIME header value encoded with
				318	quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
Amaury Forgeot d'Arc	1c25de6	2009-07-12 16:43:19 +0000	[diff] [blame]	319	the high level email.header class for that functionality.
Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	320	"""
				321	s = s.replace('_', ' ')
Ezio Melotti	2a99d5d	2013-07-06 17:16:04 +0200	[diff] [blame]	322	return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, flags=re.ASCII)