Blame - Lib/email/Charset.py - platform/external/python/cpython2

blob: 4874597a188913287f8fe2bdaccb098dc10e5046 [file] [log] [blame]

Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame^]	1	# Copyright (C) 2001,2002 Python Software Foundation
				2	# Author: che@debian.org (Ben Gertzfield)
				3
				4	from types import UnicodeType
				5	from email.Encoders import encode_7or8bit
				6	import email.base64MIME
				7	import email.quopriMIME
				8
				9
				10
				11	# Flags for types of header encodings
				12	QP = 1 # Quoted-Printable
				13	BASE64 = 2 # Base64
				14
				15	# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
				16	MISC_LEN = 7
				17
				18	DEFAULT_CHARSET = 'us-ascii'
				19
				20
				21
				22	# Defaults
				23	CHARSETS = {
				24	# input header enc body enc output conv
				25	'iso-8859-1': (QP, QP, None),
				26	'iso-8859-2': (QP, QP, None),
				27	'us-ascii': (None, None, None),
				28	'big5': (BASE64, BASE64, None),
				29	'gb2312': (BASE64, BASE64, None),
				30	'euc-jp': (BASE64, None, 'iso-2022-jp'),
				31	'shift_jis': (BASE64, None, 'iso-2022-jp'),
				32	'iso-2022-jp': (BASE64, None, None),
				33	'koi8-r': (BASE64, BASE64, None),
				34	'utf-8': (BASE64, BASE64, 'utf-8'),
				35	}
				36
				37	# Aliases for other commonly-used names for character sets. Map
				38	# them to the real ones used in email.
				39	ALIASES = {
				40	'latin_1': 'iso-8859-1',
				41	'latin-1': 'iso-8859-1',
				42	'ascii': 'us-ascii',
				43	}
				44
				45	# Map charsets to their Unicode codec strings. Note that the Japanese
				46	# examples included below do not (yet) come with Python! They are available
				47	# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
				48
				49	# The Chinese and Korean codecs are available from SourceForge:
				50	#
				51	# http://sourceforge.net/projects/python-codecs/
				52	#
				53	# although you'll need to check them out of cvs since they haven't been file
				54	# released yet. You might also try to use
				55	#
				56	# http://www.freshports.org/port-description.php3?port=6702
				57	#
				58	# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
				59	# fairly experimental at this point.
				60	CODEC_MAP = {
				61	'euc-jp': 'japanese.euc-jp',
				62	'iso-2022-jp': 'japanese.iso-2022-jp',
				63	'shift_jis': 'japanese.shift_jis',
				64	'gb2132': 'eucgb2312_cn',
				65	'big5': 'big5_tw',
				66	'utf-8': 'utf-8',
				67	# Hack: We don't want any conversion for stuff marked us-ascii, as all
				68	# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
				69	# Let that stuff pass through without conversion to/from Unicode.
				70	'us-ascii': None,
				71	}
				72
				73
				74
				75	# Convenience functions for extending the above mappings
				76	def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
				77	"""Add charset properties to the global map.
				78
				79	charset is the input character set, and must be the canonical name of a
				80	character set.
				81
				82	Optional header_enc and body_enc is either Charset.QP for
				83	quoted-printable, Charset.BASE64 for base64 encoding, or None for no
				84	encoding. It describes how message headers and message bodies in the
				85	input charset are to be encoded. Default is no encoding.
				86
				87	Optional output_charset is the character set that the output should be
				88	in. Conversions will proceed from input charset, to Unicode, to the
				89	output charset when the method Charset.convert() is called. The default
				90	is to output in the same character set as the input.
				91
				92	Both input_charset and output_charset must have Unicode codec entries in
				93	the module's charset-to-codec mapping; use add_codec(charset, codecname)
				94	to add codecs the module does not know about. See the codec module's
				95	documentation for more information.
				96	"""
				97	CHARSETS[charset] = (header_enc, body_enc, output_charset)
				98
				99
				100	def add_alias(alias, canonical):
				101	"""Add a character set alias.
				102
				103	alias is the alias name, e.g. latin-1
				104	canonical is the character set's canonical name, e.g. iso-8859-1
				105	"""
				106	ALIASES[alias] = canonical
				107
				108
				109	def add_codec(charset, codecname):
				110	"""Add a codec that map characters in the given charset to/from Unicode.
				111
				112	charset is the canonical name of a character set. codecname is the name
				113	of a Python codec, as appropriate for the second argument to the unicode()
				114	built-in, or to the .encode() method of a Unicode string.
				115	"""
				116	CODEC_MAP[charset] = codecname
				117
				118
				119
				120	class Charset:
				121	"""Map character sets to their email properties.
				122
				123	This class provides information about the requirements imposed on email
				124	for a specific character set. It also provides convenience routines for
				125	converting between character sets, given the availability of the
				126	applicable codecs. Given an character set, it will do its best to provide
				127	information on how to use that character set in an email.
				128
				129	Certain character sets must be encoded with quoted-printable or base64
				130	when used in email headers or bodies. Certain character sets must be
				131	converted outright, and are not allowed in email. Instances of this
				132	module expose the following information about a character set:
				133
				134	input_charset: The initial character set specified. Common aliases
				135	are converted to their `official' email names (e.g. latin_1
				136	is converted to iso-8859-1). Defaults to 7-bit us-ascii.
				137
				138	header_encoding: If the character set must be encoded before it can be
				139	used in an email header, this attribute will be set to
				140	Charset.QP (for quoted-printable) or Charset.BASE64 (for
				141	base64 encoding). Otherwise, it will be None.
				142
				143	body_encoding: Same as header_encoding, but describes the encoding for the
				144	mail message's body, which indeed may be different than the
				145	header encoding.
				146
				147	output_charset: Some character sets must be converted before the can be
				148	used in email headers or bodies. If the input_charset is
				149	one of them, this attribute will contain the name of the
				150	charset output will be converted to. Otherwise, it will
				151	be None.
				152
				153	input_codec: The name of the Python codec used to convert the
				154	input_charset to Unicode. If no conversion codec is
				155	necessary, this attribute will be None.
				156
				157	output_codec: The name of the Python codec used to convert Unicode
				158	to the output_charset. If no conversion codec is necessary,
				159	this attribute will have the same value as the input_codec.
				160	"""
				161	def __init__(self, input_charset=DEFAULT_CHARSET):
				162	# Set the input charset after filtering through the aliases
				163	self.input_charset = ALIASES.get(input_charset, input_charset)
				164	# We can try to guess which encoding and conversion to use by the
				165	# charset_map dictionary. Try that first, but let the user override
				166	# it.
				167	henc, benc, conv = CHARSETS.get(self.input_charset,
				168	(BASE64, BASE64, None))
				169	# Set the attributes, allowing the arguments to override the default.
				170	self.header_encoding = henc
				171	self.body_encoding = benc
				172	self.output_charset = ALIASES.get(conv, conv)
				173	# Now set the codecs. If one isn't defined for input_charset,
				174	# guess and try a Unicode codec with the same name as input_codec.
				175	self.input_codec = CODEC_MAP.get(self.input_charset,
				176	self.input_charset)
				177	self.output_codec = CODEC_MAP.get(self.output_charset,
				178	self.input_codec)
				179
				180	def __str__(self):
				181	return self.input_charset.lower()
				182
				183	def __eq__(self, other):
				184	return str(self) == str(other).lower()
				185
				186	def __ne__(self, other):
				187	return not self.__eq__(other)
				188
				189	def get_body_encoding(self):
				190	"""Return the content-transfer-encoding used for body encoding.
				191
				192	This is either the string `quoted-printable' or `base64' depending on
				193	the encoding used, or it is a function in which case you should call
				194	the function with a single argument, the Message object being
				195	encoded. The function should then set the Content-Transfer-Encoding:
				196	header itself to whatever is appropriate.
				197
				198	Returns "quoted-printable" if self.body_encoding is QP.
				199	Returns "base64" if self.body_encoding is BASE64.
				200	Returns "7bit" otherwise.
				201	"""
				202	if self.body_encoding == QP:
				203	return 'quoted-printable'
				204	elif self.body_encoding == BASE64:
				205	return 'base64'
				206	else:
				207	return encode_7or8bit
				208
				209	def convert(self, s):
				210	"""Convert a string from the input_codec to the output_codec."""
				211	if self.input_codec <> self.output_codec:
				212	return unicode(s, self.input_codec).encode(self.output_codec)
				213	else:
				214	return s
				215
				216	def to_splittable(self, s):
				217	"""Convert a possibly multibyte string to a safely splittable format.
				218
				219	Uses the input_codec to try and convert the string to Unicode, so it
				220	can be safely split on character boundaries (even for double-byte
				221	characters).
				222
				223	Returns the string untouched if we don't know how to convert it to
				224	Unicode with the input_charset.
				225
				226	Characters that could not be converted to Unicode will be replaced
				227	with the Unicode replacement character U+FFFD.
				228	"""
				229	if isinstance(s, UnicodeType) or self.input_codec is None:
				230	return s
				231	try:
				232	return unicode(s, self.input_codec, 'replace')
				233	except LookupError:
				234	# Input codec not installed on system, so return the original
				235	# string unchanged.
				236	return s
				237
				238	def from_splittable(self, ustr, to_output=1):
				239	"""Convert a splittable string back into an encoded string.
				240
				241	Uses the proper codec to try and convert the string from
				242	Unicode back into an encoded format. Return the string as-is
				243	if it is not Unicode, or if it could not be encoded from
				244	Unicode.
				245
				246	Characters that could not be converted from Unicode will be replaced
				247	with an appropriate character (usually '?').
				248
				249	If to_output is true, uses output_codec to convert to an encoded
				250	format. If to_output is false, uses input_codec. to_output defaults
				251	to 1.
				252	"""
				253	if to_output:
				254	codec = self.output_codec
				255	else:
				256	codec = self.input_codec
				257	if not isinstance(ustr, UnicodeType) or codec is None:
				258	return ustr
				259	try:
				260	return ustr.encode(codec, 'replace')
				261	except LookupError:
				262	# Output codec not installed
				263	return ustr
				264
				265	def get_output_charset(self):
				266	"""Return the output character set.
				267
				268	This is self.output_charset if that is set, otherwise it is
				269	self.input_charset.
				270	"""
				271	return self.output_charset or self.input_charset
				272
				273	def encoded_header_len(self, s):
				274	"""Return the length of the encoded header string."""
				275	cset = self.get_output_charset()
				276	# The len(s) of a 7bit encoding is len(s)
				277	if self.header_encoding is BASE64:
				278	return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
				279	elif self.header_encoding is QP:
				280	return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
				281	else:
				282	return len(s)
				283
				284	def header_encode(self, s, convert=0):
				285	"""Header-encode a string, optionally converting it to output_charset.
				286
				287	If convert is true, the string will be converted from the input
				288	charset to the output charset automatically. This is not useful for
				289	multibyte character sets, which have line length issues (multibyte
				290	characters must be split on a character, not a byte boundary); use the
				291	high-level Header class to deal with these issues. convert defaults
				292	to 0.
				293
				294	The type of encoding (base64 or quoted-printable) will be based on
				295	self.header_encoding.
				296	"""
				297	cset = self.get_output_charset()
				298	if convert:
				299	s = self.convert(s)
				300	# 7bit/8bit encodings return the string unchanged (modulo conversions)
				301	if self.header_encoding is BASE64:
				302	return email.base64MIME.header_encode(s, cset)
				303	elif self.header_encoding is QP:
				304	return email.quopriMIME.header_encode(s, cset)
				305	else:
				306	return s
				307
				308	def body_encode(self, s, convert=1):
				309	"""Body-encode a string and convert it to output_charset.
				310
				311	If convert is true (the default), the string will be converted from
				312	the input charset to output charset automatically. Unlike
				313	header_encode(), there are no issues with byte boundaries and
				314	multibyte charsets in email bodies, so this is usually pretty safe.
				315
				316	The type of encoding (base64 or quoted-printable) will be based on
				317	self.body_encoding.
				318	"""
				319	if convert:
				320	s = self.convert(s)
				321	# 7bit/8bit encodings return the string unchanged (module conversions)
				322	if self.body_encoding is BASE64:
				323	return email.base64MIME.body_encode(s)
				324	elif self.header_encoding is QP:
				325	return email.quopriMIME.body_encode(s)
				326	else:
				327	return s