blob: 4874597a188913287f8fe2bdaccb098dc10e5046 [file] [log] [blame]
Barry Warsaw409a4c02002-04-10 21:01:31 +00001# Copyright (C) 2001,2002 Python Software Foundation
2# Author: che@debian.org (Ben Gertzfield)
3
4from types import UnicodeType
5from email.Encoders import encode_7or8bit
6import email.base64MIME
7import email.quopriMIME
8
9
10
# Identifiers for the transfer encodings a charset may require
QP = 1          # quoted-printable
BASE64 = 2      # base64

# An encoded word looks like "=?charset?q?hello_world?="; the fixed
# delimiters "=?", "?q?" and "?=" contribute 2 + 3 + 2 = 7 characters.
MISC_LEN = 7

# Character set assumed when none is given explicitly
DEFAULT_CHARSET = 'us-ascii'
19
20
21
# Default properties for the known character sets.  Each entry maps an
# input charset name to a 3-tuple of
#
#     (header encoding, body encoding, output charset)
#
# where an encoding is QP, BASE64 or None (no encoding), and an output
# charset of None means that no conversion is specified.
CHARSETS = {
    'iso-8859-1':  (QP, QP, None),
    'iso-8859-2':  (QP, QP, None),
    'us-ascii':    (None, None, None),
    'big5':        (BASE64, BASE64, None),
    'gb2312':      (BASE64, BASE64, None),
    'euc-jp':      (BASE64, None, 'iso-2022-jp'),
    'shift_jis':   (BASE64, None, 'iso-2022-jp'),
    'iso-2022-jp': (BASE64, None, None),
    'koi8-r':      (BASE64, BASE64, None),
    'utf-8':       (BASE64, BASE64, 'utf-8'),
    }
36
# Other commonly-used spellings of character set names, mapped to the
# canonical names that are used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'ascii':   'us-ascii',
    }
44
45# Map charsets to their Unicode codec strings. Note that the Japanese
46# examples included below do not (yet) come with Python! They are available
47# from http://pseudo.grad.sccs.chukyo-u.ac.jp/~kajiyama/python/
48
49# The Chinese and Korean codecs are available from SourceForge:
50#
51# http://sourceforge.net/projects/python-codecs/
52#
53# although you'll need to check them out of cvs since they haven't been file
54# released yet. You might also try to use
55#
56# http://www.freshports.org/port-description.php3?port=6702
57#
58# if you can get logged in. AFAICT, both the Chinese and Korean codecs are
59# fairly experimental at this point.
# Maps a canonical charset name to the name of the Python codec used to
# convert that charset to and from Unicode.  A value of None means that no
# conversion is performed for the charset.
CODEC_MAP = {
    'euc-jp':      'japanese.euc-jp',
    'iso-2022-jp': 'japanese.iso-2022-jp',
    'shift_jis':   'japanese.shift_jis',
    # This key was previously misspelled 'gb2132', so lookups for the
    # 'gb2312' charset never found this codec.
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    'utf-8':       'utf-8',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }
72
73
74
75# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register the email properties of a character set in the global map.

    charset is the input character set; it must be the canonical name of
    that character set.

    Optional header_enc and body_enc are each either Charset.QP for
    quoted-printable, Charset.BASE64 for base64 encoding, or None for no
    encoding.  They describe how message headers and message bodies in the
    input charset are to be encoded.  The default is no encoding.

    Optional output_charset is the character set that the output should be
    in.  Conversions proceed from the input charset, to Unicode, to the
    output charset when the method Charset.convert() is called.  The default
    is to output in the same character set as the input.

    Both the input charset and output_charset must have Unicode codec
    entries in the module's charset-to-codec mapping; use
    add_codec(charset, codecname) to add codecs the module does not know
    about.  See the codec module's documentation for more information.
    """
    properties = (header_enc, body_enc, output_charset)
    CHARSETS[charset] = properties
98
99
def add_alias(alias, canonical):
    """Register an alternative name for a character set.

    alias is the alternative name, e.g. latin-1
    canonical is the canonical name of the character set, e.g. iso-8859-1
    """
    ALIASES.update({alias: canonical})
107
108
def add_codec(charset, codecname):
    """Register the codec that maps the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the
    name of a Python codec, as appropriate for the second argument to the
    unicode() built-in, or to the .encode() method of a Unicode string.
    """
    CODEC_MAP.update({charset: codecname})
117
118
119
class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g.
                   latin_1 is converted to iso-8859-1).  Defaults to 7-bit
                   us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable) or Charset.BASE64 (for
                     base64 encoding).  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for
                   the mail message's body, which indeed may be different
                   than the header encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be None.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is
                  necessary, this attribute will have the same value as the
                  input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        # Set the input charset after filtering through the aliases.
        # NOTE(review): the alias and property lookups are case-sensitive,
        # so a name such as 'ISO-8859-1' misses the lowercase table keys and
        # falls back to the defaults below -- confirm whether callers are
        # expected to pass lowercase names.
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Guess which encodings and conversion to use from the CHARSETS
        # dictionary.  Character sets that are not listed there default to
        # base64 for both headers and bodies, with no output conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (BASE64, BASE64, None))
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for input_charset,
        # guess and try a Unicode codec with the same name as input_charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        # Without a codec entry for the output charset, the output codec is
        # the same as the input codec, so convert() is a no-op.
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.input_codec)

    def __str__(self):
        """Return the input charset name in lowercase."""
        return self.input_charset.lower()

    def __eq__(self, other):
        """Compare by lowercased string form, so plain strings match too."""
        return str(self) == str(other).lower()

    def __ne__(self, other):
        """Return the negation of __eq__()."""
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding:
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the function encode_7or8bit otherwise.
        """
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec.

        The string is returned unchanged when both codecs are the same.
        """
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for double-byte
        characters).

        Returns the string untouched if we don't know how to convert it to
        Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, UnicodeType) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=1):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from
        Unicode back into an encoded format.  Return the string as-is
        if it is not Unicode, or if it could not be encoded from
        Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is true, uses output_codec to convert to an encoded
        format.  If to_output is false, uses input_codec.  to_output defaults
        to 1.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, UnicodeType) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Output codec not installed
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is set, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string."""
        cset = self.get_output_charset()
        # For the encoded forms, the charset name and the fixed encoded-word
        # delimiters (MISC_LEN) add to the length of the encoded payload.
        if self.header_encoding == BASE64:
            return email.base64MIME.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quopriMIME.header_quopri_len(s) + len(cset) + MISC_LEN
        else:
            # The length of a 7bit/8bit encoded string is just len(s)
            return len(s)

    def header_encode(self, s, convert=0):
        """Header-encode a string, optionally converting it to output_charset.

        If convert is true, the string will be converted from the input
        charset to the output charset automatically.  This is not useful for
        multibyte character sets, which have line length issues (multibyte
        characters must be split on a character, not a byte boundary); use the
        high-level Header class to deal with these issues.  convert defaults
        to 0.

        The type of encoding (base64 or quoted-printable) will be based on
        self.header_encoding.
        """
        cset = self.get_output_charset()
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        if self.header_encoding == BASE64:
            return email.base64MIME.header_encode(s, cset)
        elif self.header_encoding == QP:
            return email.quopriMIME.header_encode(s, cset)
        else:
            return s

    def body_encode(self, s, convert=1):
        """Body-encode a string and convert it to output_charset.

        If convert is true (the default), the string will be converted from
        the input charset to output charset automatically.  Unlike
        header_encode(), there are no issues with byte boundaries and
        multibyte charsets in email bodies, so this is usually pretty safe.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.
        """
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        if self.body_encoding == BASE64:
            return email.base64MIME.body_encode(s)
        # The body encoding, not the header encoding, selects
        # quoted-printable here; the two may differ for a given charset.
        elif self.body_encoding == QP:
            return email.quopriMIME.body_encode(s)
        else:
            return s
327 return s