Blame - Lib/email/quopriMIME.py - platform/external/python/cpython3

blob: 002034e169c55bd09fc82026888b26a70d443ec0 [file] [log] [blame]

Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame^]	1	# Copyright (C) 2001,2002 Python Software Foundation
				2	# Author: che@debian.org (Ben Gertzfield)
				3
				4	"""Quoted-printable content transfer encoding per RFCs 2045-2047.
				5
				6	This module handles the content transfer encoding method defined in RFC 2045
				7	to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
				8	safely encode text that is in a character set similar to the 7-bit US ASCII
				9	character set, but that includes some 8-bit characters that are normally not
				10	allowed in email bodies or headers.
				11
				12	Quoted-printable is very space-inefficient for encoding binary files; use the
				13	email.base64MIME module for that instead.
				14
				15	This module provides an interface to encode and decode both headers and bodies
				16	with quoted-printable encoding.
				17
				18	RFC 2047 defines a method for including character set information in an
				19	`encoded-word' in a header. This method is commonly used for 8-bit real names
				20	in To:/From:/Cc: etc. fields, as well as Subject: lines.
				21
				22	This module does not do the line wrapping or end-of-line character
				23	conversion necessary for proper internationalized headers; it only
				24	does dumb encoding and decoding. To deal with the various line
				25	wrapping issues, use the email.Header module.
				26	"""
				27
				28	import re
				29	from string import hexdigits
				30	from email.Utils import fix_eols
				31
# Line terminators.  CRLF is the canonical email line separator; NL is the
# bare newline used as the default `eol' argument by the functions below.
CRLF = '\r\n'
NL = '\n'

# See also Charset.py
# Number of fixed "chrome" characters wrapped around each encoded chunk by
# header_encode(), i.e. len('=?') + len('?q?') + len('?=') == 7.  The charset
# name is accounted for separately.
MISC_LEN = 7

# Matches any single character that is NOT one of: '-', an ASCII letter, an
# ASCII digit, '!', '*', '+', '/' or a space.  A match means the character
# has to be escaped in a header.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
# Matches any single character that is NOT a space, a tab, or in one of the
# ranges '!'..'<' and '>'..'~' (so '=' itself matches).  A match means the
# character has to be escaped in a body.
bqre = re.compile(r'[^ !-<>-~\t]')
				40
				41
				42
				43	# Helpers
				44	def header_quopri_check(c):
				45	"""Return true if the character should be escaped with header quopri."""
				46	return hqre.match(c) and 1
				47
				48
				49	def body_quopri_check(c):
				50	"""Return true if the character should be escaped with body quopri."""
				51	return bqre.match(c) and 1
				52
				53
				54	def header_quopri_len(s):
				55	"""Return the length of str when it is encoded with header quopri."""
				56	count = 0
				57	for c in s:
				58	if hqre.match(c):
				59	count += 3
				60	else:
				61	count += 1
				62	return count
				63
				64
				65	def body_quopri_len(str):
				66	"""Return the length of str when it is encoded with body quopri."""
				67	count = 0
				68	for c in str:
				69	if bqre.match(c):
				70	count += 3
				71	else:
				72	count += 1
				73	return count
				74
				75
				76	def _max_append(L, s, maxlen, extra=''):
				77	if not L:
				78	L.append(s)
				79	elif len(L[-1]) + len(s) < maxlen:
				80	L[-1] += extra + s
				81	else:
				82	L.append(s)
				83
				84
				85	def unquote(s):
				86	"""Turn a string in the form =AB to the ASCII character with value 0xab"""
				87	return chr(int(s[1:3], 16))
				88
				89
				90	def quote(c):
				91	return "=%02X" % ord(c)
				92
				93
				94
				95	def header_encode(header, charset="iso-8859-1", keep_eols=0, maxlinelen=76,
				96	eol=NL):
				97	"""Encode a single header line with quoted-printable (like) encoding.
				98
				99	Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
				100	used specifically for email header fields to allow charsets with mostly 7
				101	bit characters (and some 8 bit) to remain more or less readable in non-RFC
				102	2045 aware mail clients.
				103
				104	charset names the character set to use to encode the header. It defaults
				105	to iso-8859-1.
				106
				107	The resulting string will be in the form:
				108
				109	"=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?\\n
				110	=?charset?q?Silly_=C8nglish_Kn=EEghts?="
				111
				112	with each line wrapped safely at, at most, maxlinelen characters (defaults
				113	to 76 characters).
				114
				115	End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
				116	to the canonical email line separator \\r\\n unless the keep_eols
				117	parameter is set to true (the default is false).
				118
				119	Each line of the header will be terminated in the value of eol, which
				120	defaults to "\\n". Set this to "\\r\\n" if you are using the result of
				121	this function directly in email.
				122	"""
				123	# Return empty headers unchanged
				124	if not header:
				125	return header
				126
				127	if not keep_eols:
				128	header = fix_eols(header)
				129
				130	# Quopri encode each line, in encoded chunks no greater than maxlinelen in
				131	# lenght, after the RFC chrome is added in.
				132	quoted = []
				133	max_encoded = maxlinelen - len(charset) - MISC_LEN
				134
				135	for c in header:
				136	# Space may be represented as _ instead of =20 for readability
				137	if c == ' ':
				138	_max_append(quoted, '_', max_encoded)
				139	# These characters can be included verbatim
				140	elif not hqre.match(c):
				141	_max_append(quoted, c, max_encoded)
				142	# Otherwise, replace with hex value like =E2
				143	else:
				144	_max_append(quoted, "=%02X" % ord(c), max_encoded)
				145
				146	# Now add the RFC chrome to each encoded chunk and glue the chunks
				147	# together. BAW: should we be able to specify the leading whitespace in
				148	# the joiner?
				149	joiner = eol + ' '
				150	return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])
				151
				152
				153
def encode(body, binary=0, maxlinelen=76, eol=NL):
    """Encode with quoted-printable, wrapping at maxlinelen characters.

    If binary is false (the default), end-of-line characters will be converted
    to the canonical email end-of-line sequence \\r\\n. Otherwise they will
    be left verbatim.

    Each line of encoded text will end with eol, which defaults to "\\n". Set
    this to "\\r\\n" if you will be using the result of this function directly
    in an email.

    Each line will be wrapped at, at most, maxlinelen characters (defaults to
    76 characters). Long lines will have the `soft linefeed' quoted-printable
    character "=" appended to them, so the decoded text will be identical to
    the original text.

    Whitespace at the very end of the input is written as an =XX escape;
    whitespace at the end of any other line is followed by a soft line
    break.  An empty body is returned unchanged.
    """
    if not body:
        return body

    if not binary:
        body = fix_eols(body)

    # BAW: We're accumulating the body text by string concatenation. That
    # can't be very efficient, but I don't have time now to rewrite it. It
    # just feels like this algorithm could be more efficient.
    encoded_body = ''
    # Index of the line currently being processed within `lines'.
    lineno = -1
    # Preserve line endings here so we can check later whether an eol needs
    # to be added to the output.
    lines = body.splitlines(1)
    for line in lines:
        # But strip off line-endings for processing this line.
        if line.endswith(CRLF):
            line = line[:-2]
        elif line[-1] in CRLF:
            line = line[:-1]

        lineno += 1
        encoded_line = ''
        # Last raw (unquoted) character seen on this line; None for an
        # empty line.
        prev = None
        linelen = len(line)
        # Now we need to examine every character to see if it needs to be
        # quopri encoded. BAW: again, string concatenation is inefficient.
        for j in range(linelen):
            c = line[j]
            prev = c
            if bqre.match(c):
                c = quote(c)
            elif j+1 == linelen:
                # Last character of the line and it is body-safe.  Trailing
                # whitespace is a special case: it is not emitted here but
                # handled after the loop via `prev'.
                if c not in ' \t':
                    encoded_line += c
                prev = c
                continue
            # Check to see if the line has reached its maximum length; if so
            # flush it with a soft line break before adding this character.
            if len(encoded_line) + len(c) >= maxlinelen:
                encoded_body += encoded_line + '=' + eol
                encoded_line = ''
            encoded_line += c
        # Now at end of line..
        if prev and prev in ' \t':
            # Special case for whitespace at end of file: it is written as
            # an =XX escape, on a fresh line if it would not fit.
            if lineno+1 == len(lines):
                prev = quote(prev)
                if len(encoded_line) + len(prev) > maxlinelen:
                    encoded_body += encoded_line + '=' + eol + prev
                else:
                    encoded_body += encoded_line + prev
            # Just normal whitespace at end of line: keep it literal and
            # follow it with a soft line break.
            else:
                encoded_body += encoded_line + prev + '=' + eol
            encoded_line = ''
        # Now look at the line we just finished; if it has a line ending, we
        # need to add eol to the end of the line.
        if lines[lineno].endswith(CRLF) or lines[lineno][-1] in CRLF:
            encoded_body += encoded_line + eol
        else:
            encoded_body += encoded_line
        encoded_line = ''
    return encoded_body


# For convenience and backwards compatibility w/ standard base64 module
body_encode = encode
encodestring = encode
				239
				240
				241
				242	# BAW: I'm not sure if the intent was for the signature of this function to be
				243	# the same as base64MIME.decode() or not...
				244	def decode(encoded, eol=NL):
				245	"""Decode a quoted-printable string.
				246
				247	Lines are separated with eol, which defaults to \\n.
				248	"""
				249	if not encoded:
				250	return encoded
				251	# BAW: see comment in encode() above. Again, we're building up the
				252	# decoded string with string concatenation, which could be done much more
				253	# efficiently.
				254	decoded = ''
				255
				256	for line in encoded.splitlines():
				257	line = line.rstrip()
				258	if not line:
				259	decoded += eol
				260	continue
				261
				262	i = 0
				263	n = len(line)
				264	while i < n:
				265	c = line[i]
				266	if c <> '=':
				267	decoded += c
				268	i += 1
				269	# Otherwise, c == "=". Are we at the end of the line? If so, add
				270	# a soft line break.
				271	elif i+1 == n:
				272	i += 1
				273	continue
				274	# Decode if in form =AB
				275	elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
				276	decoded += unquote(line[i:i+3])
				277	i += 3
				278	# Otherwise, not in form =AB, pass literally
				279	else:
				280	decoded += c
				281	i += 1
				282
				283	if i == n:
				284	decoded += eol
				285	# Special case if original string did not end with eol
				286	if encoded[-1] <> eol and decoded[-1] == eol:
				287	decoded = decoded[:-1]
				288	return decoded
				289
				290
				291	# For convenience and backwards compatibility w/ standard base64 module
				292	body_decode = decode
				293	decodestring = decode
				294
				295
				296
				297	def _unquote_match(match):
				298	"""Turn a match in the form =AB to the ASCII character with value 0xab"""
				299	s = match.group(0)
				300	return unquote(s)
				301
				302
				303	# Header decoding is done a bit differently
				304	def header_decode(s):
				305	"""Decode a string encoded with RFC 2045 MIME header `Q' encoding.
				306
				307	This function does not parse a full MIME header value encoded with
				308	quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
				309	the high level email.Header class for that functionality.
				310	"""
				311	s = s.replace('_', ' ')
				312	return re.sub(r'=\w{2}', _unquote_match, s)