Blame - Lib/email/charset.py - platform/external/python/cpython3

2003-12-30 16:52:25 +0000

[diff] [blame]

4

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame^]

__all__ = [

'Charset',

'add_alias',

'add_charset',

'add_codec',

]

import email.base64mime

13

import email.quoprimime

14

15

from email import errors

16

from email.encoders import encode_7or8bit

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

17

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

18

19

20

# Flags for types of header encodings

Barry Warsaw

bb11386

2004-10-03 03:16:19 +0000

[diff] [blame]

21

QP = 1 # Quoted-Printable

22

BASE64 = 2 # Base64

23

SHORTEST = 3 # the shorter of QP and base64, but only for headers

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

24

25

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

26

MISC_LEN = 7

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

27

28

DEFAULT_CHARSET = 'us-ascii'

# Defaults

CHARSETS = {

# input header enc body enc output conv

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

35

'iso-8859-1': (QP, QP, None),

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

36

'iso-8859-2': (QP, QP, None),

Barry Warsaw

4e68a1e

2003-01-07 00:29:07 +0000

[diff] [blame]

37

'iso-8859-3': (QP, QP, None),

38

'iso-8859-4': (QP, QP, None),

39

# iso-8859-5 is Cyrillic, and not especially used

40

# iso-8859-6 is Arabic, also not particularly used

41

# iso-8859-7 is Greek, QP will not make it readable

42

# iso-8859-8 is Hebrew, QP will not make it readable

43

'iso-8859-9': (QP, QP, None),

44

'iso-8859-10': (QP, QP, None),

45

# iso-8859-11 is Thai, QP will not make it readable

46

'iso-8859-13': (QP, QP, None),

47

'iso-8859-14': (QP, QP, None),

48

'iso-8859-15': (QP, QP, None),

49

'windows-1252':(QP, QP, None),

50

'viscii': (QP, QP, None),

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

51

'us-ascii': (None, None, None),

52

'big5': (BASE64, BASE64, None),

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

53

'gb2312': (BASE64, BASE64, None),

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

54

'euc-jp': (BASE64, None, 'iso-2022-jp'),

55

'shift_jis': (BASE64, None, 'iso-2022-jp'),

56

'iso-2022-jp': (BASE64, None, None),

57

'koi8-r': (BASE64, BASE64, None),

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

58

'utf-8': (SHORTEST, BASE64, 'utf-8'),

Barry Warsaw

7cd7240

2002-10-14 15:06:55 +0000

[diff] [blame]

59

# We're making this one up to represent raw unencoded 8-bit

60

'8bit': (None, BASE64, 'utf-8'),

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

61

}

62

63

# Aliases for other commonly-used names for character sets. Map

64

# them to the real ones used in email.

65

ALIASES = {

66

'latin_1': 'iso-8859-1',

67

'latin-1': 'iso-8859-1',

Barry Warsaw

4e68a1e

2003-01-07 00:29:07 +0000

[diff] [blame]

68

'latin_2': 'iso-8859-2',

69

'latin-2': 'iso-8859-2',

70

'latin_3': 'iso-8859-3',

71

'latin-3': 'iso-8859-3',

72

'latin_4': 'iso-8859-4',

73

'latin-4': 'iso-8859-4',

74

'latin_5': 'iso-8859-9',

75

'latin-5': 'iso-8859-9',

76

'latin_6': 'iso-8859-10',

77

'latin-6': 'iso-8859-10',

78

'latin_7': 'iso-8859-13',

79

'latin-7': 'iso-8859-13',

80

'latin_8': 'iso-8859-14',

81

'latin-8': 'iso-8859-14',

82

'latin_9': 'iso-8859-15',

83

'latin-9': 'iso-8859-15',

84

'cp949': 'ks_c_5601-1987',

85

'euc_jp': 'euc-jp',

86

'euc_kr': 'euc-kr',

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

'ascii': 'us-ascii',

}

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

90

Barry Warsaw

2003-12-30 16:52:25 +0000

[diff] [blame]

91

# Map charsets to their Unicode codec strings.

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

92

CODEC_MAP = {

Barry Warsaw

2003-12-30 16:52:25 +0000

[diff] [blame]

93

'gb2312': 'eucgb2312_cn',

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

94

'big5': 'big5_tw',

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

95

# Hack: We don't want *any* conversion for stuff marked us-ascii, as all

96

# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.

97

# Let that stuff pass through without conversion to/from Unicode.

'us-ascii': None,

}

# Convenience functions for extending the above mappings

104

def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register the email properties of a character set.

    charset is the input character set; it must be the canonical name of
    that character set.

    header_enc and body_enc describe how message headers and message bodies
    in the input charset are to be encoded.  Each is Charset.QP for
    quoted-printable, Charset.BASE64 for base64, Charset.SHORTEST for the
    shorter of the two (valid for header_enc only), or None for no
    encoding.  The default is no encoding.

    output_charset is the character set the output should be in.  When
    Charset.convert() is called, conversion goes from the input charset,
    to Unicode, to the output charset.  By default the output uses the same
    character set as the input.

    Both the input charset and output_charset need Unicode codec entries in
    this module's charset-to-codec mapping; use add_codec(charset,
    codecname) to register codecs the module does not know about.  See the
    codecs module's documentation for more information.

    Raises ValueError if body_enc is SHORTEST.
    """
    entry = (header_enc, body_enc, output_charset)
    # SHORTEST only makes sense for headers; reject it before registering.
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = entry

130

131

132

def add_alias(alias, canonical):
    """Register an alternative name for a character set.

    alias is the alternative name, e.g. latin-1.
    canonical is the canonical name it stands for, e.g. iso-8859-1.

    An existing entry for the same alias is overwritten.
    """
    ALIASES[alias] = canonical

139

140

141

def add_codec(charset, codecname):
    """Register the codec that maps a charset's characters to/from Unicode.

    charset is the canonical name of a character set.  codecname is the
    name of a Python codec, suitable as the second argument to the
    unicode() built-in or as the argument to the encode() method of a
    Unicode string.

    An existing entry for the same charset is overwritten.
    """
    CODEC_MAP[charset] = codecname

class Charset:

"""Map character sets to their email properties.

154

155

This class provides information about the requirements imposed on email

156

for a specific character set. It also provides convenience routines for

157

converting between character sets, given the availability of the

Barry Warsaw

2002-10-01 00:05:24 +0000

[diff] [blame]

158

applicable codecs. Given a character set, it will do its best to provide

159

information on how to use that character set in an email in an

160

RFC-compliant way.

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

161

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

162

Certain character sets must be encoded with quoted-printable or base64

163

when used in email headers or bodies. Certain character sets must be

164

converted outright, and are not allowed in email. Instances of this

165

class expose the following information about a character set:

166

167

input_charset: The initial character set specified. Common aliases

168

are converted to their `official' email names (e.g. latin_1

169

is converted to iso-8859-1). Defaults to 7-bit us-ascii.

170

171

header_encoding: If the character set must be encoded before it can be

172

used in an email header, this attribute will be set to

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

173

Charset.QP (for quoted-printable), Charset.BASE64 (for

174

base64 encoding), or Charset.SHORTEST for the shortest of

175

QP or BASE64 encoding. Otherwise, it will be None.

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

176

177

body_encoding: Same as header_encoding, but describes the encoding for the

178

mail message's body, which indeed may be different than the

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

179

header encoding. Charset.SHORTEST is not allowed for

180

body_encoding.

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

181

182

output_charset: Some character sets must be converted before they can be

183

used in email headers or bodies. If the input_charset is

184

one of them, this attribute will contain the name of the

185

charset output will be converted to. Otherwise, it will

186

be None.

187

188

input_codec: The name of the Python codec used to convert the

189

input_charset to Unicode. If no conversion codec is

190

necessary, this attribute will be None.

191

192

output_codec: The name of the Python codec used to convert Unicode

193

to the output_charset. If no conversion codec is necessary,

194

this attribute will have the same value as the input_codec.

195

"""

196

def __init__(self, input_charset=DEFAULT_CHARSET):
    """Resolve input_charset and derive its encodings and codecs.

    input_charset is lower-cased and mapped through ALIASES; header and
    body encodings plus the output charset come from CHARSETS, and the
    codec names from CODEC_MAP.

    Raises errors.CharsetError if input_charset is not pure ASCII.
    """
    # RFC 2046, $4.1.2 says charsets are not case sensitive.  The name is
    # coerced to unicode because unicode.lower() is locale insensitive.  A
    # value that is already unicode is kept, but it must still be ASCII.
    try:
        if not isinstance(input_charset, unicode):
            input_charset = unicode(input_charset, 'ascii')
        else:
            input_charset.encode('ascii')
    except UnicodeError:
        raise errors.CharsetError(input_charset)
    lowered = input_charset.lower()
    # Store the input charset after filtering it through the aliases.
    canonical = ALIASES.get(lowered, lowered)
    self.input_charset = canonical
    # Look the charset up in the registry; unknown charsets fall back to
    # SHORTEST header encoding, BASE64 body encoding and no conversion.
    fallback = (SHORTEST, BASE64, None)
    header_enc, body_enc, target = CHARSETS.get(canonical, fallback)
    # No explicit output charset means output in the input charset.
    target = target or canonical
    self.header_encoding = header_enc
    self.body_encoding = body_enc
    self.output_charset = ALIASES.get(target, target)
    # Codecs: when CODEC_MAP has no entry, guess that a Unicode codec with
    # the same name as the charset exists.
    self.input_codec = CODEC_MAP.get(canonical, canonical)
    self.output_codec = CODEC_MAP.get(self.output_charset,
                                      self.output_charset)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

228

229

def __str__(self):

230

return self.input_charset.lower()

231

Barry Warsaw

784cf6a

2003-03-06 05:16:29 +0000

[diff] [blame]

232

__repr__ = __str__

233

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

234

def __eq__(self, other):

235

return str(self) == str(other).lower()

236

237

def __ne__(self, other):

238

return not self.__eq__(other)

239

240

def get_body_encoding(self):
    """Return the content-transfer-encoding used for body encoding.

    The result is either the string `quoted-printable' or `base64',
    depending on the encoding used, or it is a function.  In the latter
    case you should call the function with a single argument, the Message
    object being encoded; the function should then set the
    Content-Transfer-Encoding header itself to whatever is appropriate.

    Returns "quoted-printable" if self.body_encoding is QP.
    Returns "base64" if self.body_encoding is BASE64.
    Returns the encode_7or8bit function otherwise.
    """
    # SHORTEST is only meaningful for headers; add_charset() rejects it for
    # bodies, so seeing it here is an internal error.  `!=' replaces the
    # obsolete `<>' spelling, which is a syntax error on Python 3.
    assert self.body_encoding != SHORTEST
    if self.body_encoding == QP:
        return 'quoted-printable'
    elif self.body_encoding == BASE64:
        return 'base64'
    else:
        return encode_7or8bit

260

261

def convert(self, s):
    """Convert a string from the input_codec to the output_codec.

    s is decoded to Unicode with self.input_codec and re-encoded with
    self.output_codec.  When both codecs are the same, s is returned
    unchanged without any decoding.
    """
    # `!=' replaces the obsolete `<>' spelling, which is a syntax error on
    # Python 3; the two are equivalent on Python 2.
    if self.input_codec != self.output_codec:
        return unicode(s, self.input_codec).encode(self.output_codec)
    else:
        return s

def to_splittable(self, s):
    """Convert a possibly multibyte string to a safely splittable format.

    The string is decoded to Unicode with the input_codec so that it can be
    split on character boundaries, even when it contains multibyte
    characters.

    s is returned as-is when it is already Unicode, when there is no
    input_codec, or when the input codec cannot be found.

    Characters that cannot be converted to Unicode are replaced with the
    Unicode replacement character U+FFFD.
    """
    codec = self.input_codec
    if codec is None or isinstance(s, unicode):
        return s
    try:
        decoded = unicode(s, codec, 'replace')
    except LookupError:
        # Input codec not installed on this system; hand back the original
        # string unchanged.
        return s
    return decoded

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

290

def from_splittable(self, ustr, to_output=True):
    """Convert a splittable string back into an encoded string.

    The string is encoded from Unicode with output_codec when to_output is
    True (the default), and with input_codec when to_output is False.

    ustr is returned as-is when it is not Unicode, when the selected codec
    is None, or when that codec cannot be found.

    Characters that cannot be converted from Unicode are replaced with an
    appropriate character (usually '?').
    """
    if not to_output:
        codec = self.input_codec
    else:
        codec = self.output_codec
    if codec is None or not isinstance(ustr, unicode):
        return ustr
    try:
        encoded = ustr.encode(codec, 'replace')
    except LookupError:
        # Selected codec not installed; hand back the original string.
        return ustr
    return encoded

314

315

def get_output_charset(self):
    """Return the output character set.

    This is self.output_charset when that is set (truthy); otherwise it is
    self.input_charset.
    """
    target = self.output_charset
    if target:
        return target
    return self.input_charset

322

323

def encoded_header_len(self, s):
    """Return the length of the encoded header string.

    For BASE64, QP and SHORTEST header encodings the result is the encoded
    payload length plus the length of the output charset name plus
    MISC_LEN (the `=?', `?q?' and `?=' delimiters).  For any other header
    encoding the string is not encoded and the result is len(s).
    """
    cset = self.get_output_charset()
    mode = self.header_encoding
    if mode == BASE64:
        payload = email.base64mime.base64_len(s)
    elif mode == QP:
        payload = email.quoprimime.header_quopri_len(s)
    elif mode == SHORTEST:
        lenb64 = email.base64mime.base64_len(s)
        lenqp = email.quoprimime.header_quopri_len(s)
        payload = min(lenb64, lenqp)
    else:
        # The len(s) of a 7bit encoding is len(s).
        return len(s)
    return payload + len(cset) + MISC_LEN

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

338

def header_encode(self, s, convert=False):
    """Header-encode a string, optionally converting it to output_charset.

    If convert is True, the string is first converted from the input
    charset to the output charset.  This is not useful for multibyte
    character sets, which have line length issues (multibyte characters
    must be split on a character, not a byte boundary); use the high-level
    Header class to deal with these issues.  convert defaults to False.

    The type of encoding (base64 or quoted-printable) is based on
    self.header_encoding; with SHORTEST, base64 is used only when it is
    strictly shorter than quoted-printable.  For any other header encoding
    the (possibly converted) string is returned unencoded.
    """
    cset = self.get_output_charset()
    if convert:
        s = self.convert(s)
    mode = self.header_encoding
    if mode == BASE64:
        use_base64 = True
    elif mode == QP:
        use_base64 = False
    elif mode == SHORTEST:
        lenb64 = email.base64mime.base64_len(s)
        lenqp = email.quoprimime.header_quopri_len(s)
        use_base64 = lenb64 < lenqp
    else:
        # 7bit/8bit encodings return the string unchanged (modulo
        # conversions).
        return s
    if use_base64:
        return email.base64mime.header_encode(s, cset)
    return email.quoprimime.header_encode(s, cset, maxlinelen=None)

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

369

def body_encode(self, s, convert=True):

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

370

"""Body-encode a string and convert it to output_charset.

371

Barry Warsaw

2002-09-28 17:47:56 +0000

[diff] [blame]

372

If convert is True (the default), the string will be converted from

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

373

the input charset to output charset automatically. Unlike

374

header_encode(), there are no issues with byte boundaries and

375

multibyte charsets in email bodies, so this is usually pretty safe.

376

377

The type of encoding (base64 or quoted-printable) will be based on

self.body_encoding.

"""

if convert:

s = self.convert(s)

# 7bit/8bit encodings return the string unchanged (modulo conversions)

383

if self.body_encoding is BASE64:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame^]

384

return email.base64mime.body_encode(s)

Barry Warsaw

3d57589

2002-10-21 05:29:53 +0000

[diff] [blame]

385

elif self.body_encoding is QP:

Thomas Wouters

2006-04-21 10:40:58 +0000

[diff] [blame^]

386

return email.quoprimime.body_encode(s)

Barry Warsaw