Blame - Lib/email/Charset.py - platform/external/python/cpython3

blob: 9a54dab9ef864e8b55593465c4b83f36c1abdac7 [file] [log] [blame]

Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	1	# Copyright (C) 2001,2002 Python Software Foundation
				2	# Author: che@debian.org (Ben Gertzfield)
				3
Guido van Rossum	1a7ac35	2002-05-28 18:49:03 +0000	[diff] [blame]	4	try:
				5	unicode
				6	except NameError:
				7	def _is_unicode(x):
				8	return 1==0
				9	else:
Barry Warsaw	bb98c8c	2002-06-01 03:56:07 +0000	[diff] [blame]	10	# Use UnicodeType instead of built-in unicode for Py2.1 compatibility
				11	from types import UnicodeType
Guido van Rossum	1a7ac35	2002-05-28 18:49:03 +0000	[diff] [blame]	12	def _is_unicode(x):
Barry Warsaw	bb98c8c	2002-06-01 03:56:07 +0000	[diff] [blame]	13	return isinstance(x, UnicodeType)
Guido van Rossum	1a7ac35	2002-05-28 18:49:03 +0000	[diff] [blame]	14
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	15	from email.Encoders import encode_7or8bit
				16	import email.base64MIME
				17	import email.quopriMIME
				18
				19
				20
				21	# Flags for types of header encodings
				22	QP = 1 # Quoted-Printable
				23	BASE64 = 2 # Base64
				24
				25	# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
Tim Peters	8ac1495	2002-05-23 15:15:30 +0000	[diff] [blame]	26	MISC_LEN = 7
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	27
				28	DEFAULT_CHARSET = 'us-ascii'
				29
				30
				31
				32	# Defaults
				33	CHARSETS = {
				34	# input header enc body enc output conv
Tim Peters	8ac1495	2002-05-23 15:15:30 +0000	[diff] [blame]	35	'iso-8859-1': (QP, QP, None),
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	36	'iso-8859-2': (QP, QP, None),
				37	'us-ascii': (None, None, None),
				38	'big5': (BASE64, BASE64, None),
Tim Peters	8ac1495	2002-05-23 15:15:30 +0000	[diff] [blame]	39	'gb2312': (BASE64, BASE64, None),
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	40	'euc-jp': (BASE64, None, 'iso-2022-jp'),
				41	'shift_jis': (BASE64, None, 'iso-2022-jp'),
				42	'iso-2022-jp': (BASE64, None, None),
				43	'koi8-r': (BASE64, BASE64, None),
				44	'utf-8': (BASE64, BASE64, 'utf-8'),
				45	}
				46
				47	# Aliases for other commonly-used names for character sets. Map
				48	# them to the real ones used in email.
				49	ALIASES = {
				50	'latin_1': 'iso-8859-1',
				51	'latin-1': 'iso-8859-1',
				52	'ascii': 'us-ascii',
				53	}
				54
				55	# Map charsets to their Unicode codec strings. Note that the Japanese
				56	# examples included below do not (yet) come with Python! They are available
				57	# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
				58
				59	# The Chinese and Korean codecs are available from SourceForge:
				60	#
				61	# http://sourceforge.net/projects/python-codecs/
				62	#
				63	# although you'll need to check them out of cvs since they haven't been file
				64	# released yet. You might also try to use
				65	#
				66	# http://www.freshports.org/port-description.php3?port=6702
				67	#
				68	# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
				69	# fairly experimental at this point.
				70	CODEC_MAP = {
				71	'euc-jp': 'japanese.euc-jp',
				72	'iso-2022-jp': 'japanese.iso-2022-jp',
				73	'shift_jis': 'japanese.shift_jis',
				74	'gb2132': 'eucgb2312_cn',
				75	'big5': 'big5_tw',
				76	'utf-8': 'utf-8',
				77	# Hack: We don't want any conversion for stuff marked us-ascii, as all
				78	# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
				79	# Let that stuff pass through without conversion to/from Unicode.
				80	'us-ascii': None,
				81	}
				82
				83
				84
				85	# Convenience functions for extending the above mappings
				86	def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
				87	"""Add charset properties to the global map.
				88
				89	charset is the input character set, and must be the canonical name of a
				90	character set.
				91
				92	Optional header_enc and body_enc is either Charset.QP for
				93	quoted-printable, Charset.BASE64 for base64 encoding, or None for no
				94	encoding. It describes how message headers and message bodies in the
				95	input charset are to be encoded. Default is no encoding.
				96
				97	Optional output_charset is the character set that the output should be
				98	in. Conversions will proceed from input charset, to Unicode, to the
				99	output charset when the method Charset.convert() is called. The default
				100	is to output in the same character set as the input.
				101
				102	Both input_charset and output_charset must have Unicode codec entries in
				103	the module's charset-to-codec mapping; use add_codec(charset, codecname)
				104	to add codecs the module does not know about. See the codec module's
				105	documentation for more information.
				106	"""
				107	CHARSETS[charset] = (header_enc, body_enc, output_charset)
				108
				109
				110	def add_alias(alias, canonical):
				111	"""Add a character set alias.
				112
				113	alias is the alias name, e.g. latin-1
				114	canonical is the character set's canonical name, e.g. iso-8859-1
				115	"""
				116	ALIASES[alias] = canonical
				117
				118
				119	def add_codec(charset, codecname):
				120	"""Add a codec that map characters in the given charset to/from Unicode.
				121
				122	charset is the canonical name of a character set. codecname is the name
				123	of a Python codec, as appropriate for the second argument to the unicode()
				124	built-in, or to the .encode() method of a Unicode string.
				125	"""
				126	CODEC_MAP[charset] = codecname
				127
				128
				129
				130	class Charset:
				131	"""Map character sets to their email properties.
				132
				133	This class provides information about the requirements imposed on email
				134	for a specific character set. It also provides convenience routines for
				135	converting between character sets, given the availability of the
				136	applicable codecs. Given an character set, it will do its best to provide
				137	information on how to use that character set in an email.
Tim Peters	8ac1495	2002-05-23 15:15:30 +0000	[diff] [blame]	138
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	139	Certain character sets must be encoded with quoted-printable or base64
				140	when used in email headers or bodies. Certain character sets must be
				141	converted outright, and are not allowed in email. Instances of this
				142	module expose the following information about a character set:
				143
				144	input_charset: The initial character set specified. Common aliases
				145	are converted to their `official' email names (e.g. latin_1
				146	is converted to iso-8859-1). Defaults to 7-bit us-ascii.
				147
				148	header_encoding: If the character set must be encoded before it can be
				149	used in an email header, this attribute will be set to
				150	Charset.QP (for quoted-printable) or Charset.BASE64 (for
				151	base64 encoding). Otherwise, it will be None.
				152
				153	body_encoding: Same as header_encoding, but describes the encoding for the
				154	mail message's body, which indeed may be different than the
				155	header encoding.
				156
				157	output_charset: Some character sets must be converted before the can be
				158	used in email headers or bodies. If the input_charset is
				159	one of them, this attribute will contain the name of the
				160	charset output will be converted to. Otherwise, it will
				161	be None.
				162
				163	input_codec: The name of the Python codec used to convert the
				164	input_charset to Unicode. If no conversion codec is
				165	necessary, this attribute will be None.
				166
				167	output_codec: The name of the Python codec used to convert Unicode
				168	to the output_charset. If no conversion codec is necessary,
				169	this attribute will have the same value as the input_codec.
				170	"""
				171	def __init__(self, input_charset=DEFAULT_CHARSET):
				172	# Set the input charset after filtering through the aliases
				173	self.input_charset = ALIASES.get(input_charset, input_charset)
				174	# We can try to guess which encoding and conversion to use by the
				175	# charset_map dictionary. Try that first, but let the user override
				176	# it.
				177	henc, benc, conv = CHARSETS.get(self.input_charset,
				178	(BASE64, BASE64, None))
				179	# Set the attributes, allowing the arguments to override the default.
				180	self.header_encoding = henc
				181	self.body_encoding = benc
				182	self.output_charset = ALIASES.get(conv, conv)
				183	# Now set the codecs. If one isn't defined for input_charset,
				184	# guess and try a Unicode codec with the same name as input_codec.
				185	self.input_codec = CODEC_MAP.get(self.input_charset,
				186	self.input_charset)
				187	self.output_codec = CODEC_MAP.get(self.output_charset,
				188	self.input_codec)
				189
				190	def __str__(self):
				191	return self.input_charset.lower()
				192
				193	def __eq__(self, other):
				194	return str(self) == str(other).lower()
				195
				196	def __ne__(self, other):
				197	return not self.__eq__(other)
				198
				199	def get_body_encoding(self):
				200	"""Return the content-transfer-encoding used for body encoding.
				201
				202	This is either the string `quoted-printable' or `base64' depending on
				203	the encoding used, or it is a function in which case you should call
				204	the function with a single argument, the Message object being
				205	encoded. The function should then set the Content-Transfer-Encoding:
				206	header itself to whatever is appropriate.
				207
				208	Returns "quoted-printable" if self.body_encoding is QP.
				209	Returns "base64" if self.body_encoding is BASE64.
				210	Returns "7bit" otherwise.
				211	"""
				212	if self.body_encoding == QP:
				213	return 'quoted-printable'
				214	elif self.body_encoding == BASE64:
				215	return 'base64'
				216	else:
				217	return encode_7or8bit
				218
				219	def convert(self, s):
				220	"""Convert a string from the input_codec to the output_codec."""
				221	if self.input_codec <> self.output_codec:
				222	return unicode(s, self.input_codec).encode(self.output_codec)
				223	else:
				224	return s
				225
				226	def to_splittable(self, s):
				227	"""Convert a possibly multibyte string to a safely splittable format.
				228
				229	Uses the input_codec to try and convert the string to Unicode, so it
				230	can be safely split on character boundaries (even for double-byte
				231	characters).
				232
				233	Returns the string untouched if we don't know how to convert it to
				234	Unicode with the input_charset.
				235
				236	Characters that could not be converted to Unicode will be replaced
				237	with the Unicode replacement character U+FFFD.
				238	"""
Guido van Rossum	1a7ac35	2002-05-28 18:49:03 +0000	[diff] [blame]	239	if _is_unicode(s) or self.input_codec is None:
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	240	return s
				241	try:
				242	return unicode(s, self.input_codec, 'replace')
				243	except LookupError:
				244	# Input codec not installed on system, so return the original
				245	# string unchanged.
				246	return s
				247
				248	def from_splittable(self, ustr, to_output=1):
				249	"""Convert a splittable string back into an encoded string.
				250
				251	Uses the proper codec to try and convert the string from
				252	Unicode back into an encoded format. Return the string as-is
				253	if it is not Unicode, or if it could not be encoded from
				254	Unicode.
				255
				256	Characters that could not be converted from Unicode will be replaced
				257	with an appropriate character (usually '?').
				258
				259	If to_output is true, uses output_codec to convert to an encoded
				260	format. If to_output is false, uses input_codec. to_output defaults
				261	to 1.
				262	"""
				263	if to_output:
				264	codec = self.output_codec
				265	else:
				266	codec = self.input_codec
Guido van Rossum	1a7ac35	2002-05-28 18:49:03 +0000	[diff] [blame]	267	if not _is_unicode(ustr) or codec is None:
Barry Warsaw	409a4c0	2002-04-10 21:01:31 +0000	[diff] [blame]	268	return ustr
				269	try:
				270	return ustr.encode(codec, 'replace')
				271	except LookupError:
				272	# Output codec not installed
				273	return ustr
				274
				275	def get_output_charset(self):
				276	"""Return the output character set.
				277
				278	This is self.output_charset if that is set, otherwise it is
				279	self.input_charset.
				280	"""
				281	return self.output_charset or self.input_charset
				282
				283	def encoded_header_len(self, s):
				284	"""Return the length of the encoded header string."""
				285	cset = self.get_output_charset()
				286	# The len(s) of a 7bit encoding is len(s)
				287	if self.header_encoding is BASE64:
				288	return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
				289	elif self.header_encoding is QP:
				290	return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
				291	else:
				292	return len(s)
				293
				294	def header_encode(self, s, convert=0):
				295	"""Header-encode a string, optionally converting it to output_charset.
				296
				297	If convert is true, the string will be converted from the input
				298	charset to the output charset automatically. This is not useful for
				299	multibyte character sets, which have line length issues (multibyte
				300	characters must be split on a character, not a byte boundary); use the
				301	high-level Header class to deal with these issues. convert defaults
				302	to 0.
				303
				304	The type of encoding (base64 or quoted-printable) will be based on
				305	self.header_encoding.
				306	"""
				307	cset = self.get_output_charset()
				308	if convert:
				309	s = self.convert(s)
				310	# 7bit/8bit encodings return the string unchanged (modulo conversions)
				311	if self.header_encoding is BASE64:
				312	return email.base64MIME.header_encode(s, cset)
				313	elif self.header_encoding is QP:
				314	return email.quopriMIME.header_encode(s, cset)
				315	else:
				316	return s
				317
				318	def body_encode(self, s, convert=1):
				319	"""Body-encode a string and convert it to output_charset.
				320
				321	If convert is true (the default), the string will be converted from
				322	the input charset to output charset automatically. Unlike
				323	header_encode(), there are no issues with byte boundaries and
				324	multibyte charsets in email bodies, so this is usually pretty safe.
				325
				326	The type of encoding (base64 or quoted-printable) will be based on
				327	self.body_encoding.
				328	"""
				329	if convert:
				330	s = self.convert(s)
				331	# 7bit/8bit encodings return the string unchanged (module conversions)
				332	if self.body_encoding is BASE64:
				333	return email.base64MIME.body_encode(s)
				334	elif self.header_encoding is QP:
				335	return email.quopriMIME.body_encode(s)
				336	else:
				337	return s