Blame - Lib/email/Header.py - platform/external/python/cpython3

2002-04-10 21:01:31 +0000

[diff] [blame]

1

2

# Author: che@debian.org (Ben Gertzfield)

3

4

"""Header encoding and decoding functionality."""

5

6

import re

7

import email.quopriMIME

8

import email.base64MIME

9

from email.Charset import Charset

10

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

11

try:

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

12

from email._compat22 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

13

except SyntaxError:

14

# Python 2.1 spells integer division differently

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

15

from email._compat21 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

16

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

17

# Line-ending and whitespace building blocks used by the header code below.
CRLF = '\r\n'
CRLFSPACE = CRLF + ' '
NL = '\n'
SPACE8 = ' ' * 8
EMPTYSTRING = ''

# Default maximum header line length (used when no maxlinelen is supplied).
MAXLINELEN = 76

# NOTE(review): ENCODE/DECODE are not referenced in the visible part of this
# file; presumably kept for callers elsewhere -- confirm before removing.
ENCODE = 1
DECODE = 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The three named groups (charset, encoding, encoded) are what split()
# yields between the surrounding unencoded fragments.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.IGNORECASE | re.VERBOSE)

# Helpers

_max_append = email.quopriMIME._max_append

def decode_header(header):
    """Decode a message header value without converting charset.

    The header is coerced to a string and scanned line by line for RFC 2047
    encoded-words (=?charset?q?...?= or =?charset?b?...?=).

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.  Adjacent encoded parts that share a
    charset are merged into one pair; an unencoded fragment that follows an
    unencoded pair is appended to that pair, separated by a single space.
    """
    # If no encoding, just return the header
    header = str(header)
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # split() yields: unencoded, charset, encoding, encoded, unencoded, ...
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Append the unencoded text itself.  (Previously the
                    # stale value of `dec' was appended here, which silently
                    # dropped `unenc' from the result.)
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # The pattern only admits q/b, so this is purely defensive.
                    dec = encoded

                if decoded and decoded[-1][1] == charset:
                    # Same charset as the previous pair: extend it in place.
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the (charset, encoding, encoded) triple just consumed.
            del parts[0:3]
    return decoded

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

94

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header instance from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header(): each pair is (decoded_string, charset), where charset is
    either None, a character set name string, or a Charset instance.

    Optional maxlinelen, header_name, and continuation_ws are passed straight
    through to the Header constructor.  Returns the populated Header.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for pair in decoded_seq:
        text, cset = pair
        if cset is None or isinstance(cset, Charset):
            # None means us-ascii, and Charset instances are already in the
            # right form; either can be handed to append() untouched.
            result.append(text, cset)
        else:
            # Anything else is treated as a charset name and wrapped.
            result.append(text, Charset(cset))
    return result

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

117

class Header:

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

118

def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
             continuation_ws=' '):
    """Create a MIME-compliant header that can contain many languages.

    s is the optional initial header value; when it is None nothing is
    appended and the header starts out empty.

    charset is a Charset object (not a character set name string!) giving
    both the character set of s and the default character set for later
    .append() calls.  When it is None a default-constructed Charset is
    used for both purposes.

    More text can be added afterwards with append(s, charset); the charset
    given there need not match the one given here, and when omitted it
    falls back to the constructor's charset.

    maxlinelen sets the maximum line length explicitly; the default is 76.
    Pass the field name (e.g. `Subject') as header_name to shorten the
    first line so that it leaves room for the field name, which is not
    part of s.

    continuation_ws must be RFC 2822 compliant folding whitespace (usually
    either a space or a hard tab) which will be prepended to continuation
    lines.
    """
    if maxlinelen is None:
        maxlinelen = MAXLINELEN
    if charset is None:
        charset = Charset()
    # Width in columns of the continuation prefix, counting a tab as eight.
    ws_columns = len(continuation_ws.replace('\t', SPACE8))
    if header_name is None:
        # We don't know anything about the field header so the first line
        # is the same length as subsequent lines.
        first_budget = maxlinelen
    else:
        # Leave room for the field name plus 2 for the colon and space.
        first_budget = maxlinelen - len(header_name) - 2
    self._charset = charset
    self._continuation_ws = continuation_ws
    self._firstlinelen = first_budget
    # Second and subsequent lines lose the columns taken up by the
    # continuation whitespace prefix.
    self._maxlinelen = maxlinelen - ws_columns
    # BAW: I believe `chunks' and `maxlinelen' should be non-public.
    self._chunks = []
    if s is not None:
        self.append(s, charset)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

167

168

def __str__(self):

169

"""A synonym for self.encode()."""

170

return self.encode()

171

Barry Warsaw

8e69bda

2002-06-29 03:26:58 +0000

[diff] [blame]

172

def __unicode__(self):

173

"""Helper for the built-in unicode function."""

174

# charset item is a Charset instance so we need to stringify it.

175

uchunks = [unicode(s, str(charset)) for s, charset in self._chunks]

176

return u''.join(uchunks)

177

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

178

# Rich comparison operators for equality only. BAW: does it make sense to

179

# have or explicitly disable <, <=, >, >= operators?

180

def __eq__(self, other):

181

# other may be a Header or a string. Both are fine so coerce

182

# ourselves to a string, swap the args and do another comparison.

183

return other == self.encode()

184

185

def __ne__(self, other):

186

return not self == other

187

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

188

def append(self, s, charset=None):

189

"""Append string s with Charset charset to the MIME header.

190

Barry Warsaw

6ee7156

2002-07-03 05:04:04 +0000

[diff] [blame]

191

charset defaults to the one given in the class constructor. If

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

192

charset is given, it should be an instance of Charset (not a character

193

set name string!).

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

194

"""

195

if charset is None:

196

charset = self._charset

197

self._chunks.append((s, charset))

Tim Peters

8ac1495

2002-05-23 15:15:30 +0000

[diff] [blame]

198

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

199

def _split(self, s, charset, firstline=0):
    """Split s into a list of (string, charset) pairs that fit a line.

    Private helper used when encoding.  The string is round-tripped
    through the charset's splittable form; if its encoded header length
    already fits within self._maxlinelen it is returned as a single pair.
    Otherwise us-ascii text is handed to _ascii_split(), and any other
    text is cut in two and each piece is split recursively.  firstline is
    only propagated to the first piece.
    """
    splittable = charset.to_splittable(s)
    encoded = charset.from_splittable(splittable)
    elen = charset.encoded_header_len(encoded)

    # Already short enough: nothing to do.
    if elen <= self._maxlinelen:
        return [(encoded, charset)]
    # BAW: I'm not sure what the right test here is.  What we're trying to
    # do is be faithful to RFC 2822's recommendation that ($2.2.3):
    #
    # "Note: Though structured field bodies are defined in such a way that
    # folding can take place between many of the lexical tokens (and even
    # within some of the lexical tokens), folding SHOULD be limited to
    # placing the CRLF at higher-level syntactic breaks."
    #
    # For now, I can only imagine doing this when the charset is us-ascii,
    # although it's possible that other charsets may also benefit from the
    # higher-level syntactic breaks.
    if charset == 'us-ascii':
        return self._ascii_split(s, charset, firstline)
    # BAW: should we use encoded?
    if elen == len(s):
        # We can split on _maxlinelen boundaries because we know that the
        # encoding won't change the size of the string
        cut = self._maxlinelen
    else:
        # Divide and conquer.
        cut = _floordiv(len(splittable), 2)
    head = charset.from_splittable(splittable[:cut], 0)
    tail = charset.from_splittable(splittable[cut:], 0)
    # Do the split
    return self._split(head, charset, firstline) + self._split(tail, charset)

237

238

def _ascii_split(self, s, charset, firstline):
    """Fold us-ascii text s into a list of (chunk, charset) pairs.

    Attempt to split each line at the highest-level syntactic break
    possible.  Note that we don't have a lot of smarts about field
    syntax; we just try to break on semi-colons, then whitespace.

    If firstline is true, the first line processed is measured against
    self._firstlinelen; every later line is left-stripped and measured
    against self._maxlinelen.  A line with no usable break point is
    emitted unchanged even if it is too long.
    """
    rtn = []
    lines = s.splitlines()
    while lines:
        line = lines.pop(0)
        if firstline:
            maxlinelen = self._firstlinelen
            firstline = 0
        else:
            line = line.lstrip()
            maxlinelen = self._maxlinelen
        # Short lines can remain unchanged (tabs count as eight columns)
        if len(line.replace('\t', SPACE8)) <= maxlinelen:
            rtn.append(line)
        else:
            oldlen = len(line)
            # Try to break the line on semicolons, but if that doesn't
            # work, try to split on folding whitespace.
            while len(line) > maxlinelen:
                i = line.rfind(';', 0, maxlinelen)
                if i < 0:
                    break
                rtn.append(line[:i] + ';')
                line = line[i+1:]
            # Is the remaining stuff still longer than maxlinelen?
            if len(line) <= maxlinelen:
                # Splitting on semis worked
                rtn.append(line)
                continue
            # Splitting on semis didn't finish the job.  If it did any
            # work at all, stick the remaining junk on the front of the
            # `lines' sequence and let the next pass do its thing.
            if len(line) != oldlen:
                lines.insert(0, line)
                continue
            # Otherwise, splitting on semis didn't help at all.  The
            # capturing group keeps the whitespace runs in `parts', so it
            # alternates text, whitespace, text, ...
            parts = re.split(r'(\s+)', line)
            if len(parts) == 1 or (len(parts) == 3 and
                                   parts[0].endswith(':')):
                # This line can't be split on whitespace.  There's now
                # little we can do to get this into maxlinelen.  BAW:
                # We're still potentially breaking the RFC by possibly
                # allowing lines longer than the absolute maximum of 998
                # characters.  For now, let it slide.
                #
                # len(parts) will be 1 if this line has no `Field: '
                # prefix, otherwise it will be len(3).
                rtn.append(line)
                continue
            # There is whitespace we can split on.
            first = parts.pop(0)
            sublines = [first]
            acc = len(first)
            # `parts' now holds (whitespace, text) pairs, so parts[1]
            # exists whenever parts is non-empty.
            while parts:
                len0 = len(parts[0])
                len1 = len(parts[1])
                if acc + len0 + len1 <= maxlinelen:
                    sublines.append(parts.pop(0))
                    sublines.append(parts.pop(0))
                    acc += len0 + len1
                else:
                    # Split it here, but don't forget to ignore the
                    # next whitespace-only part
                    if first != '':
                        rtn.append(EMPTYSTRING.join(sublines))
                    del parts[0]
                    first = parts.pop(0)
                    sublines = [first]
                    acc = len(first)
            rtn.append(EMPTYSTRING.join(sublines))
    return [(chunk, charset) for chunk in rtn]

312

313

def _encode_chunks(self):
    """MIME-encode a header with many different charsets and/or encodings.

    Walks the (string, charset) pairs held in self._chunks and returns a
    single MIME-encoded string suitable for use in a header field.  Each
    pair may have a different charset and/or encoding, and the resulting
    header will accurately reflect each setting.

    A pair whose charset is None, or whose charset has no header
    encoding, is added verbatim; any other pair is first passed through
    its charset's header_encode().

    The resulting lines are joined with a newline followed by the
    continuation whitespace, giving a string in the format:

    "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
     =?charset2?b?SvxyZ2VuIEL2aW5n?="
    """
    limit = self._maxlinelen
    pieces = []
    for text, cset in self._chunks:
        if cset is not None and cset.header_encoding is not None:
            _max_append(pieces, cset.header_encode(text, 0), limit, ' ')
        else:
            # There's no encoding for this chunk's charset
            _max_append(pieces, text, limit)
    return (NL + self._continuation_ws).join(pieces)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

342

343

def encode(self):

344

"""Encode a message header, possibly converting charset and encoding.

345

346

There are many issues involved in converting a given string for use in

347

an email header. Only certain character sets are readable in most

348

email clients, and as header strings can only contain a subset of

349

7-bit ASCII, care must be taken to properly convert and encode (with

350

Base64 or quoted-printable) header strings. In addition, there is a

351

75-character length limit on any given encoded header field, so

352

line-wrapping must be performed, even with double-byte character sets.

Tim Peters

8ac1495

2002-05-23 15:15:30 +0000

[diff] [blame]

353

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

354

This method will do its best to convert the string to the correct

355

character set used in email, and encode and line wrap it safely with

356

the appropriate scheme for that character set.

357

358

If the given charset is not known or an error occurs during

359

conversion, this function will return the header untouched.

360

"""

361

newchunks = []

362

for s, charset in self._chunks:

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

363

newchunks += self._split(s, charset, 1)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

364

self._chunks = newchunks

Barry Warsaw