Blame - Lib/email/charset.py - platform/external/python/cpython2

blob: 1435ee5749a10a216c9dffa3726bd7665065e0c5 [file] [log] [blame]

Guido van Rossum	8b3febe	2007-08-30 01:15:14 +0000	[diff] [blame]	1	# Copyright (C) 2001-2007 Python Software Foundation
				2	# Author: Ben Gertzfield, Barry Warsaw
				3	# Contact: email-sig@python.org
				4
				5	__all__ = [
				6	'Charset',
				7	'add_alias',
				8	'add_charset',
				9	'add_codec',
				10	]
				11
				12	import email.base64mime
				13	import email.quoprimime
				14
				15	from email import errors
				16	from email.encoders import encode_7or8bit
				17
				18
				19
# Flags for types of header encodings.  These are the values stored in the
# header-encoding and body-encoding slots of the CHARSETS registry.
QP = 1  # Quoted-Printable
BASE64 = 2  # Base64
SHORTEST = 3  # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
MISC_LEN = 7

# Character set used by Charset() when no input charset is given.
DEFAULT_CHARSET = 'us-ascii'
				29
				30
				31
# Defaults
#
# Registry of known character sets.  Each key is a canonical (lower-case)
# charset name; each value is a 3-tuple of
#     (header encoding, body encoding, output charset)
# where the encodings are QP, BASE64, SHORTEST or None (no encoding), and the
# output charset is the charset to convert to, or None for no conversion.
CHARSETS = {
    # input charset  (header enc, body enc, output conv)
    'iso-8859-1': (QP, QP, None),
    'iso-8859-2': (QP, QP, None),
    'iso-8859-3': (QP, QP, None),
    'iso-8859-4': (QP, QP, None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9': (QP, QP, None),
    'iso-8859-10': (QP, QP, None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP, QP, None),
    'iso-8859-14': (QP, QP, None),
    'iso-8859-15': (QP, QP, None),
    'iso-8859-16': (QP, QP, None),
    'windows-1252': (QP, QP, None),
    'viscii': (QP, QP, None),
    'us-ascii': (None, None, None),
    'big5': (BASE64, BASE64, None),
    'gb2312': (BASE64, BASE64, None),
    'euc-jp': (BASE64, None, 'iso-2022-jp'),
    'shift_jis': (BASE64, None, 'iso-2022-jp'),
    'iso-2022-jp': (BASE64, None, None),
    'koi8-r': (BASE64, BASE64, None),
    'utf-8': (SHORTEST, BASE64, 'utf-8'),
    }
				61
# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.  Keys are lower-case because
# Charset lower-cases the requested name before looking it up here.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10': 'iso-8859-16',
    'latin-10': 'iso-8859-16',
    'cp949': 'ks_c_5601-1987',
    'euc_jp': 'euc-jp',
    'euc_kr': 'euc-kr',
    'ascii': 'us-ascii',
    }
				90
				91
# Map charsets to their Unicode codec strings.  A charset with no entry
# here is looked up by Charset under its own name; a value of None means
# "do not convert this charset at all".
CODEC_MAP = {
    'gb2312': 'eucgb2312_cn',
    'big5': 'big5_tw',
    # Hack: We don't want any conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii': None,
    }
				101
				102
				103
				104	# Convenience functions for extending the above mappings
				105	def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
				106	"""Add character set properties to the global registry.
				107
				108	charset is the input character set, and must be the canonical name of a
				109	character set.
				110
				111	Optional header_enc and body_enc is either Charset.QP for
				112	quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
				113	the shortest of qp or base64 encoding, or None for no encoding. SHORTEST
				114	is only valid for header_enc. It describes how message headers and
				115	message bodies in the input charset are to be encoded. Default is no
				116	encoding.
				117
				118	Optional output_charset is the character set that the output should be
				119	in. Conversions will proceed from input charset, to Unicode, to the
				120	output charset when the method Charset.convert() is called. The default
				121	is to output in the same character set as the input.
				122
				123	Both input_charset and output_charset must have Unicode codec entries in
				124	the module's charset-to-codec mapping; use add_codec(charset, codecname)
				125	to add codecs the module does not know about. See the codecs module's
				126	documentation for more information.
				127	"""
				128	if body_enc == SHORTEST:
				129	raise ValueError('SHORTEST not allowed for body_enc')
				130	CHARSETS[charset] = (header_enc, body_enc, output_charset)
				131
				132
def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1

    The pair is stored in the module-level ALIASES mapping, replacing any
    existing entry for the same alias.  Charset lower-cases the requested
    name before consulting ALIASES, so alias should be given in lower case
    for it to be matched.
    """
    ALIASES[alias] = canonical
				140
				141
def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the second argument to the unicode()
    built-in, or to the encode() method of a Unicode string.

    The pair is stored in the module-level CODEC_MAP mapping, replacing any
    existing entry for the same charset.
    """
    CODEC_MAP[charset] = codecname
				150
				151
				152
				153	class Charset:
				154	"""Map character sets to their email properties.
				155
				156	This class provides information about the requirements imposed on email
				157	for a specific character set. It also provides convenience routines for
				158	converting between character sets, given the availability of the
				159	applicable codecs. Given a character set, it will do its best to provide
				160	information on how to use that character set in an email in an
				161	RFC-compliant way.
				162
				163	Certain character sets must be encoded with quoted-printable or base64
				164	when used in email headers or bodies. Certain character sets must be
				165	converted outright, and are not allowed in email. Instances of this
				166	module expose the following information about a character set:
				167
				168	input_charset: The initial character set specified. Common aliases
				169	are converted to their `official' email names (e.g. latin_1
				170	is converted to iso-8859-1). Defaults to 7-bit us-ascii.
				171
				172	header_encoding: If the character set must be encoded before it can be
				173	used in an email header, this attribute will be set to
				174	Charset.QP (for quoted-printable), Charset.BASE64 (for
				175	base64 encoding), or Charset.SHORTEST for the shortest of
				176	QP or BASE64 encoding. Otherwise, it will be None.
				177
				178	body_encoding: Same as header_encoding, but describes the encoding for the
				179	mail message's body, which indeed may be different than the
				180	header encoding. Charset.SHORTEST is not allowed for
				181	body_encoding.
				182
				183	output_charset: Some character sets must be converted before the can be
				184	used in email headers or bodies. If the input_charset is
				185	one of them, this attribute will contain the name of the
				186	charset output will be converted to. Otherwise, it will
				187	be None.
				188
				189	input_codec: The name of the Python codec used to convert the
				190	input_charset to Unicode. If no conversion codec is
				191	necessary, this attribute will be None.
				192
				193	output_codec: The name of the Python codec used to convert Unicode
				194	to the output_charset. If no conversion codec is necessary,
				195	this attribute will have the same value as the input_codec.
				196	"""
				197	def __init__(self, input_charset=DEFAULT_CHARSET):
				198	# RFC 2046, $4.1.2 says charsets are not case sensitive. We coerce to
				199	# unicode because its .lower() is locale insensitive. If the argument
				200	# is already a unicode, we leave it at that, but ensure that the
				201	# charset is ASCII, as the standard (RFC XXX) requires.
				202	try:
				203	if isinstance(input_charset, str):
				204	input_charset.encode('ascii')
				205	else:
				206	input_charset = str(input_charset, 'ascii')
				207	except UnicodeError:
				208	raise errors.CharsetError(input_charset)
				209	input_charset = input_charset.lower()
				210	# Set the input charset after filtering through the aliases
				211	self.input_charset = ALIASES.get(input_charset, input_charset)
				212	# We can try to guess which encoding and conversion to use by the
				213	# charset_map dictionary. Try that first, but let the user override
				214	# it.
				215	henc, benc, conv = CHARSETS.get(self.input_charset,
				216	(SHORTEST, BASE64, None))
				217	if not conv:
				218	conv = self.input_charset
				219	# Set the attributes, allowing the arguments to override the default.
				220	self.header_encoding = henc
				221	self.body_encoding = benc
				222	self.output_charset = ALIASES.get(conv, conv)
				223	# Now set the codecs. If one isn't defined for input_charset,
				224	# guess and try a Unicode codec with the same name as input_codec.
				225	self.input_codec = CODEC_MAP.get(self.input_charset,
				226	self.input_charset)
				227	self.output_codec = CODEC_MAP.get(self.output_charset,
				228	self.output_charset)
				229
				230	def __str__(self):
				231	return self.input_charset.lower()
				232
				233	__repr__ = __str__
				234
				235	def __eq__(self, other):
				236	return str(self) == str(other).lower()
				237
				238	def __ne__(self, other):
				239	return not self.__eq__(other)
				240
				241	def get_body_encoding(self):
				242	"""Return the content-transfer-encoding used for body encoding.
				243
				244	This is either the string `quoted-printable' or `base64' depending on
				245	the encoding used, or it is a function in which case you should call
				246	the function with a single argument, the Message object being
				247	encoded. The function should then set the Content-Transfer-Encoding
				248	header itself to whatever is appropriate.
				249
				250	Returns "quoted-printable" if self.body_encoding is QP.
				251	Returns "base64" if self.body_encoding is BASE64.
				252	Returns "7bit" otherwise.
				253	"""
				254	assert self.body_encoding != SHORTEST
				255	if self.body_encoding == QP:
				256	return 'quoted-printable'
				257	elif self.body_encoding == BASE64:
				258	return 'base64'
				259	else:
				260	return encode_7or8bit
				261
				262	def convert(self, s):
				263	"""Convert a string from the input_codec to the output_codec."""
				264	if self.input_codec != self.output_codec:
				265	rawbytes = bytes(ord(c) for c in s)
				266	decoded = rawbytes.decode(self.input_codec)
				267	encoded = decoded.encode(self.output_codec)
				268	return str(encoded)
				269	else:
				270	return s
				271
				272	def to_splittable(self, s):
				273	"""Convert a possibly multibyte string to a safely splittable format.
				274
				275	Uses the input_codec to try and convert the string to Unicode, so it
				276	can be safely split on character boundaries (even for multibyte
				277	characters).
				278
				279	Returns the string as-is if it isn't known how to convert it to
				280	Unicode with the input_charset.
				281
				282	Characters that could not be converted to Unicode will be replaced
				283	with the Unicode replacement character U+FFFD.
				284	"""
				285	if isinstance(s, str) or self.input_codec is None:
				286	return s
				287	try:
				288	return str(s, self.input_codec, 'replace')
				289	except LookupError:
				290	# Input codec not installed on system, so return the original
				291	# string unchanged.
				292	return s
				293
				294	def from_splittable(self, ustr, to_output=True):
				295	"""Convert a splittable string back into an encoded string.
				296
				297	Uses the proper codec to try and convert the string from Unicode back
				298	into an encoded format. Return the string as-is if it is not Unicode,
				299	or if it could not be converted from Unicode.
				300
				301	Characters that could not be converted from Unicode will be replaced
				302	with an appropriate character (usually '?').
				303
				304	If to_output is True (the default), uses output_codec to convert to an
				305	encoded format. If to_output is False, uses input_codec.
				306	"""
				307	if to_output:
				308	codec = self.output_codec
				309	else:
				310	codec = self.input_codec
				311	if not isinstance(ustr, str) or codec is None:
				312	return ustr
				313	try:
				314	return str(ustr.encode(codec, 'replace'))
				315	except LookupError:
				316	# Output codec not installed
				317	return ustr
				318
				319	def get_output_charset(self):
				320	"""Return the output character set.
				321
				322	This is self.output_charset if that is not None, otherwise it is
				323	self.input_charset.
				324	"""
				325	return self.output_charset or self.input_charset
				326
				327	def encoded_header_len(self, s):
				328	"""Return the length of the encoded header string."""
				329	cset = self.get_output_charset()
				330	# The len(s) of a 7bit encoding is len(s)
				331	if self.header_encoding == BASE64:
				332	return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
				333	elif self.header_encoding == QP:
				334	return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
				335	elif self.header_encoding == SHORTEST:
				336	lenb64 = email.base64mime.base64_len(s)
				337	lenqp = email.quoprimime.header_quopri_len(s)
				338	return min(lenb64, lenqp) + len(cset) + MISC_LEN
				339	else:
				340	return len(s)
				341
				342	def header_encode(self, string):
				343	"""Header-encode a string by converting it first to bytes.
				344
				345	:param string: A unicode string for the header. This must be
				346	encodable to bytes using the current character set's `output_codec`.
				347
				348	The type of encoding (base64 or quoted-printable) will be based on
				349	this charset's `header_encoding`.
				350	"""
				351	codec = self.output_codec or 'us-ascii'
				352	charset = self.get_output_charset()
				353	header_bytes = string.encode(codec)
				354	# 7bit/8bit encodings return the string unchanged (modulo conversions)
				355	if self.header_encoding == BASE64:
				356	encoder = email.base64mime.header_encode
				357	elif self.header_encoding == QP:
				358	encoder = email.quoprimime.header_encode
				359	elif self.header_encoding == SHORTEST:
				360	lenb64 = email.base64mime.base64_len(header_bytes)
				361	lenqp = email.quoprimime.header_quopri_len(header_bytes)
				362	if lenb64 < lenqp:
				363	encoder = email.base64mime.header_encode
				364	else:
				365	encoder = email.quoprimime.header_encode
				366	else:
				367	return string
				368	return encoder(header_bytes, codec)
				369
				370	def body_encode(self, s, convert=True):
				371	"""Body-encode a string and convert it to output_charset.
				372
				373	If convert is True (the default), the string will be converted from
				374	the input charset to output charset automatically. Unlike
				375	header_encode(), there are no issues with byte boundaries and
				376	multibyte charsets in email bodies, so this is usually pretty safe.
				377
				378	The type of encoding (base64 or quoted-printable) will be based on
				379	self.body_encoding.
				380	"""
				381	if convert:
				382	s = self.convert(s)
				383	# 7bit/8bit encodings return the string unchanged (module conversions)
				384	if self.body_encoding is BASE64:
				385	return email.base64mime.body_encode(s)
				386	elif self.body_encoding is QP:
				387	return email.quoprimime.body_encode(s)
				388	else:
				389	return s