Blame - Lib/email/charset.py - platform/external/python/cpython3

2007-08-30 01:15:14 +0000

[diff] [blame]

1

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

# Names exported by ``from email.charset import *``: the Charset class and
# the three registry helpers defined below.
__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

12

from functools import partial

13

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

14

import email.base64mime

15

import email.quoprimime

16

17

from email import errors

18

from email.encoders import encode_7or8bit

# Flags for types of header encodings
QP = 1  # Quoted-Printable
BASE64 = 2  # Base64
SHORTEST = 3  # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
RFC2047_CHROME_LEN = 7

# Charset assumed by Charset.__init__() when no input charset is given.
DEFAULT_CHARSET = 'us-ascii'
# Pseudo-charset name that _encode() special-cases: strings tagged with it
# are encoded as ASCII with the 'surrogateescape' error handler.
UNKNOWN8BIT = 'unknown-8bit'
# Separator used when joining lists of characters back into a string.
EMPTYSTRING = ''

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

# Defaults

# Registry mapping a canonical (lower-case) input charset name to a 3-tuple
# of (header encoding, body encoding, output charset).  The encodings are
# one of the QP / BASE64 / SHORTEST flags or None; an output charset of None
# means "same as the input charset" (see Charset.__init__).  Extended at
# run time by add_charset().
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'iso-8859-16': (QP,        QP,      None),
    'windows-1252':(QP,        QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
    }

65

66

# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.  Keys are looked up after the
# charset name has been lower-cased (see Charset.__init__); extended at
# run time by add_alias().
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10':'iso-8859-16',
    'latin-10':'iso-8859-16',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }

# Map charsets to their Unicode codec strings.  Charsets without an entry
# fall back to a codec with the same name as the charset (see
# Charset.__init__); extended at run time by add_codec().
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }

# Convenience functions for extending the above mappings

109

def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register the email properties of a character set.

    charset is the input character set; it must be the canonical name of
    that character set.

    header_enc and body_enc are each optional and say how message headers
    and message bodies in the input charset are to be encoded: Charset.QP
    for quoted-printable, Charset.BASE64 for base64, Charset.SHORTEST for
    whichever of the two is shorter, or None (the default) for no encoding.
    SHORTEST is accepted for header_enc only; passing it as body_enc raises
    ValueError.

    output_charset is optional and names the character set the output
    should be in.  Conversions proceed from the input charset, to Unicode,
    to the output charset when the method Charset.convert() is called.  By
    default the output is in the same character set as the input.

    Both the input charset and output_charset must have Unicode codec
    entries in the module's charset-to-codec mapping; use
    add_codec(charset, codecname) to add codecs the module does not know
    about.  See the codecs module's documentation for more information.
    """
    # Validate before touching the registry so a bad call leaves it intact.
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    properties = (header_enc, body_enc, output_charset)
    CHARSETS[charset] = properties

135

136

137

def add_alias(alias, canonical):
    """Register an alternative name for a character set.

    alias is the alternative name (for example latin-1) and canonical is
    the canonical name of the character set it stands for (for example
    iso-8859-1).  An existing entry for the same alias is overwritten.
    """
    ALIASES[alias] = canonical

144

145

146

def add_codec(charset, codecname):
    """Register the codec that maps the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the
    name of a Python codec, i.e. a value suitable as the encoding argument
    of str.encode() or of the str() built-in when decoding bytes.  An
    existing entry for the same charset is overwritten.
    """
    CODEC_MAP[charset] = codecname

R. David Murray

2011-01-07 23:25:30 +0000

[diff] [blame]

157

# Convenience function for encoding strings, taking into account

158

# that they might be unknown-8bit (ie: have surrogate-escaped bytes)

159

def _encode(string, codec):
    """Encode *string* to bytes with *codec*.

    The UNKNOWN8BIT pseudo-codec is handled specially: such strings are
    encoded as ASCII with the 'surrogateescape' error handler instead of
    being passed to a codec of that name.
    """
    if codec != UNKNOWN8BIT:
        return string.encode(codec)
    return string.encode('ascii', 'surrogateescape')

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

167

class Charset:

168

"""Map character sets to their email properties.

169

170

This class provides information about the requirements imposed on email

171

for a specific character set. It also provides convenience routines for

172

converting between character sets, given the availability of the

173

applicable codecs. Given a character set, it will do its best to provide

174

information on how to use that character set in an email in an

175

RFC-compliant way.

176

177

Certain character sets must be encoded with quoted-printable or base64

178

when used in email headers or bodies. Certain character sets must be

179

converted outright, and are not allowed in email. Instances of this

180

module expose the following information about a character set:

181

182

input_charset: The initial character set specified. Common aliases

183

are converted to their `official' email names (e.g. latin_1

184

is converted to iso-8859-1). Defaults to 7-bit us-ascii.

185

186

header_encoding: If the character set must be encoded before it can be

187

used in an email header, this attribute will be set to

188

Charset.QP (for quoted-printable), Charset.BASE64 (for

189

base64 encoding), or Charset.SHORTEST for the shortest of

190

QP or BASE64 encoding. Otherwise, it will be None.

191

192

body_encoding: Same as header_encoding, but describes the encoding for the

193

mail message's body, which indeed may be different than the

194

header encoding. Charset.SHORTEST is not allowed for

195

body_encoding.

196

R David Murray

037f658

2013-07-12 22:55:43 -0400

[diff] [blame]

197

output_charset: Some character sets must be converted before they can be

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

198

used in email headers or bodies. If the input_charset is

199

one of them, this attribute will contain the name of the

200

charset output will be converted to. Otherwise, it will

201

be None.

202

203

input_codec: The name of the Python codec used to convert the

204

input_charset to Unicode. If no conversion codec is

205

necessary, this attribute will be None.

206

207

output_codec: The name of the Python codec used to convert Unicode

208

to the output_charset. If no conversion codec is necessary,

209

this attribute will have the same value as the input_codec.

210

"""

211

def __init__(self, input_charset=DEFAULT_CHARSET):
    """Resolve *input_charset* and derive its encodings and codecs.

    input_charset may be a str or an ASCII-decodable bytes-like object.
    errors.CharsetError is raised when the name is not pure ASCII.
    """
    # RFC 2046, $4.1.2 says charsets are not case sensitive.  Work on a
    # str so that .lower() is locale insensitive, and insist that the
    # name itself is ASCII either way.
    try:
        if not isinstance(input_charset, str):
            input_charset = str(input_charset, 'ascii')
        else:
            # Already a str: encode only to verify that it is ASCII.
            input_charset.encode('ascii')
    except UnicodeError:
        raise errors.CharsetError(input_charset)
    lowered = input_charset.lower()
    # The stored input charset is the alias-resolved, lower-cased name.
    self.input_charset = ALIASES.get(lowered, lowered)
    # Look up the registered properties; unknown charsets get
    # (SHORTEST, BASE64, None).
    fallback = (SHORTEST, BASE64, None)
    henc, benc, conv = CHARSETS.get(self.input_charset, fallback)
    # No explicit output charset means "output in the input charset".
    conv = conv or self.input_charset
    self.header_encoding = henc
    self.body_encoding = benc
    self.output_charset = ALIASES.get(conv, conv)
    # Finally the codecs.  A charset without a CODEC_MAP entry is assumed
    # to have a Python codec of the same name.
    self.input_codec = CODEC_MAP.get(self.input_charset,
                                     self.input_charset)
    self.output_codec = CODEC_MAP.get(self.output_charset,
                                      self.output_charset)

def __str__(self):

return self.input_charset.lower()

__repr__ = __str__

def __eq__(self, other):

250

return str(self) == str(other).lower()

251

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

252

def get_body_encoding(self):
    """Return the content-transfer-encoding used for body encoding.

    The result is the string 'quoted-printable' when self.body_encoding
    is QP and the string 'base64' when it is BASE64.  For any other value
    a function is returned instead; call it with a single argument, the
    Message object being encoded, and it should set the
    Content-Transfer-Encoding header itself to whatever is appropriate.
    """
    body_enc = self.body_encoding
    # SHORTEST is only meaningful for headers, never for bodies.
    assert body_enc != SHORTEST
    if body_enc == QP:
        return 'quoted-printable'
    if body_enc == BASE64:
        return 'base64'
    return encode_7or8bit

272

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

273

def get_output_charset(self):
    """Return the output character set.

    That is self.output_charset when it is set to a true value, and
    self.input_charset otherwise.
    """
    if self.output_charset:
        return self.output_charset
    return self.input_charset

280

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

281

def header_encode(self, string):
    """Header-encode a string by converting it first to bytes.

    Whether base64 or quoted-printable is used depends on this charset's
    `header_encoding`.

    :param string: A unicode string for the header.  It must be possible
        to encode this string to bytes using the character set's
        output codec.
    :return: The encoded string, with RFC 2047 chrome, or the original
        string unchanged when no header encoding applies.
    """
    codec = self.output_codec if self.output_codec else 'us-ascii'
    raw = _encode(string, codec)
    module = self._get_encoder(raw)
    # 7bit/8bit encodings return the string unchanged (modulo conversions)
    if module is not None:
        return module.header_encode(raw, codec)
    return string

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

299

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

300

def header_encode_lines(self, string, maxlengths):
    """Header-encode a string by converting it first to bytes.

    This is similar to `header_encode()` except that the string is fit
    into maximum line lengths as given by the argument.

    :param string: A unicode string for the header.  It must be possible
        to encode this string to bytes using the character set's
        output codec.
    :param maxlengths: Maximum line length iterator.  Each element
        returned from this iterator will provide the next maximum line
        length.  This parameter is used as an argument to built-in next()
        and should never be exhausted.  The maximum line lengths should
        not count the RFC 2047 chrome.  These line lengths are only a
        hint; the splitter does the best it can.
    :return: Lines of encoded strings, each with RFC 2047 chrome.  The
        first element is None when not even one character fits on the
        first line.
    """
    # See which encoding we should use.
    codec = self.output_codec or 'us-ascii'
    header_bytes = _encode(string, codec)
    encoder_module = self._get_encoder(header_bytes)
    # NOTE: _get_encoder() returns None when header_encoding is None; this
    # method assumes a real encoder module and fails with AttributeError
    # on the next line otherwise.
    encoder = partial(encoder_module.header_encode, charset=codec)
    # Calculate the number of characters that the RFC 2047 chrome will
    # contribute to each line.
    charset = self.get_output_charset()
    extra = len(charset) + RFC2047_CHROME_LEN
    # Now comes the hard part.  We must encode bytes but we can't split on
    # bytes because some character sets are variable length and each
    # encoded word must stand on its own.  So the problem is you have to
    # encode to bytes to figure out this word's length, but you must split
    # on characters.  This causes two problems: first, we don't know how
    # many octets a specific substring of unicode characters will get
    # encoded to, and second, we don't know how many ASCII characters
    # those octets will get encoded to.  Unless we try it.  Which seems
    # inefficient.  In the interest of being correct rather than fast (and
    # in the hope that there will be few encoded headers in any such
    # message), brute force it. :(
    lines = []
    current_line = []
    maxlen = next(maxlengths) - extra
    for character in string:
        current_line.append(character)
        this_line = EMPTYSTRING.join(current_line)
        # Measure with the same codec that produces the emitted bytes, so
        # the probe and the output agree even when the charset name is
        # mapped to a differently named codec via CODEC_MAP/add_codec().
        length = encoder_module.header_length(_encode(this_line, codec))
        if length > maxlen:
            # This last character doesn't fit so pop it off.
            current_line.pop()
            # Does nothing fit on the first line?
            if not lines and not current_line:
                lines.append(None)
            else:
                joined_line = EMPTYSTRING.join(current_line)
                header_bytes = _encode(joined_line, codec)
                lines.append(encoder(header_bytes))
            # Start the next line with the character that overflowed.
            current_line = [character]
            maxlen = next(maxlengths) - extra
    # Flush whatever is left as the final line.
    joined_line = EMPTYSTRING.join(current_line)
    header_bytes = _encode(joined_line, codec)
    lines.append(encoder(header_bytes))
    return lines

361

362

def _get_encoder(self, header_bytes):
    """Return the module that implements this charset's header encoding.

    The result is email.base64mime or email.quoprimime.  For SHORTEST the
    choice is made by comparing the encoded length of *header_bytes* under
    both schemes; base64 wins only when it is strictly shorter.  None is
    returned when header_encoding is none of BASE64, QP or SHORTEST.
    """
    mode = self.header_encoding
    if mode == BASE64:
        return email.base64mime
    if mode == QP:
        return email.quoprimime
    if mode != SHORTEST:
        return None
    len64 = email.base64mime.header_length(header_bytes)
    lenqp = email.quoprimime.header_length(header_bytes)
    return email.base64mime if len64 < lenqp else email.quoprimime

def body_encode(self, string):

R David Murray

2014-02-07 12:40:37 -0500

[diff] [blame]

378

"""Body-encode a string by converting it first to bytes.

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

379

380

The type of encoding (base64 or quoted-printable) will be based on

R David Murray

2014-02-07 12:40:37 -0500

[diff] [blame]

381

self.body_encoding. If body_encoding is None, we assume the

382

output charset is a 7bit encoding, so re-encoding the decoded

383

string using the ascii codec produces the correct string version

384

of the content.

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

385

"""

R David Murray

15a693a

2014-02-07 12:46:17 -0500

[diff] [blame]

386

if not string:

387

return string

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

388

if self.body_encoding is BASE64:

R David Murray

2014-02-07 12:40:37 -0500

[diff] [blame]

389

if isinstance(string, str):

390

string = string.encode(self.output_charset)

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

391

return email.base64mime.body_encode(string)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

392

elif self.body_encoding is QP:

R David Murray

f581b37

2013-02-05 10:49:49 -0500

[diff] [blame]

393

# quopromime.body_encode takes a string, but operates on it as if

394

# it were a list of byte codes. For a (minimal) history on why

395

# this is so, see changeset 0cf700464177. To correctly encode a

396

# character set, then, we must turn it into pseudo bytes via the

397

# latin1 charset, which will encode any byte as a single code point

398

# between 0 and 255, which is what body_encode is expecting.

R David Murray

2014-02-07 12:40:37 -0500

[diff] [blame]

399

if isinstance(string, str):

R David Murray

15a693a

2014-02-07 12:46:17 -0500

[diff] [blame]

400

string = string.encode(self.output_charset)

401

string = string.decode('latin1')

Guido van Rossum

2007-08-30 03:46:43 +0000

[diff] [blame]

402

return email.quoprimime.body_encode(string)

Guido van Rossum

2007-08-30 01:15:14 +0000

[diff] [blame]

403

else:

R David Murray