Blame - Lib/email/Header.py - platform/external/python/cpython3

2002-04-10 21:01:31 +0000

[diff] [blame]

1

2

# Author: che@debian.org (Ben Gertzfield)

3

4

"""Header encoding and decoding functionality."""

5

6

import re

7

import email.quopriMIME

8

import email.base64MIME

9

from email.Charset import Charset

10

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

11

try:

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

12

from email._compat22 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

13

except SyntaxError:

14

# Python 2.1 spells integer division differently

Barry Warsaw

1c30aa2

2002-06-01 05:49:17 +0000

[diff] [blame]

15

from email._compat21 import _floordiv

Barry Warsaw

2002-05-19 23:47:53 +0000

[diff] [blame]

16

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

17

# Line-ending and whitespace building blocks used throughout this module.
CRLF = '\r\n'
CRLFSPACE = CRLF + ' '
NL = '\n'
# A hard tab is counted as eight columns when measuring line widths.
SPACE8 = ' ' * 8
EMPTYSTRING = ''

# Default maximum header line length (see Header.__init__).
MAXLINELEN = 76

# NOTE(review): ENCODE/DECODE are not referenced by the code visible here;
# presumably direction flags kept for other callers -- confirm before removing.
ENCODE = 1
DECODE = 2

# Pattern for encoded-word strings of the form =?charset?q?Hello_World?=
# It is compiled in verbose mode, so the whitespace and trailing comments
# inside the pattern are ignored; matching is case insensitive.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Helpers

# Local alias for email.quopriMIME._max_append; Header._encode_chunks() calls
# it to add each (possibly encoded) chunk to its list of output lines.
_max_append = email.quopriMIME._max_append

def decode_header(header):
    """Decode a message header value without converting charset.

    header is converted with str() and scanned line by line for encoded
    words (=?charset?q-or-b?text?=).

    Returns a list of (decoded_string, charset) pairs containing each of the
    decoded parts of the header.  Charset is None for non-encoded parts of the
    header, otherwise a lower-case string containing the name of the character
    set specified in the encoded string.  Adjacent parts that share the same
    charset (including adjacent non-encoded parts) are merged into one pair.
    """
    header = str(header)
    # If no encoding, just return the header
    if not ecre.search(header):
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        # This line might not have an encoding in it
        if not ecre.search(line):
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() yields a repeating sequence of
        # [unencoded, charset, encoding, encoded, unencoded, ...].
        parts = ecre.split(line)
        while parts:
            unenc = parts.pop(0).strip()
            if unenc:
                # Should we continue a long line?
                if decoded and decoded[-1][1] is None:
                    # Extend the previous non-encoded chunk with this text.
                    # (This used to append the stale value of the previously
                    # decoded word instead, which dropped `unenc' entirely.)
                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
                else:
                    decoded.append((unenc, None))
            if parts:
                charset, encoding = [s.lower() for s in parts[0:2]]
                encoded = parts[2]
                if encoding == 'q':
                    dec = email.quopriMIME.header_decode(encoded)
                elif encoding == 'b':
                    dec = email.base64MIME.decode(encoded)
                else:
                    # The pattern only admits q/b, so this is purely defensive.
                    dec = encoded
                # Merge consecutive encoded words that share a charset.
                if decoded and decoded[-1][1] == charset:
                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
                else:
                    decoded.append((dec, charset))
            # Drop the charset/encoding/encoded triple just consumed (a no-op
            # when the list is already empty).
            del parts[0:3]
    return decoded

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame^]

96

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from (decoded_string, charset) pairs.

    decoded_seq is a sequence of pairs in the format returned by
    decode_header(): the decoded text and its character set.  A charset that
    is not already a Charset instance is wrapped in one before the text is
    appended.

    maxlinelen, header_name, and continuation_ws are optional and are passed
    straight through to the Header constructor.  The populated Header
    instance is returned.
    """
    header = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, cs in decoded_seq:
        if isinstance(cs, Charset):
            header.append(text, cs)
        else:
            header.append(text, Charset(cs))
    return header

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

118

class Header:

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame^]

119

def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
             continuation_ws=' '):
    """Create a MIME-compliant header that can contain many languages.

    s is the optional initial header value; when it is None the header
    starts out empty.

    charset is a Charset object (not a character set name string!) that
    serves both as the character set of s and as the default for later
    .append() calls.  When it is None a default Charset() is created.

    More text can be added later with append(s, charset); the charset
    given there may differ from the one given here and, when omitted,
    falls back to the one given here.

    maxlinelen sets the maximum line length explicitly; it defaults to
    76.  Pass the field name (e.g. `Subject') as header_name to shorten
    the first line so that it leaves room for the field header, which is
    not part of s.

    continuation_ws must be RFC 2822 compliant folding whitespace
    (usually either a space or a hard tab); it is prepended to
    continuation lines.
    """
    if charset is None:
        charset = Charset()
    self._charset = charset
    self._continuation_ws = continuation_ws
    # Width in columns of the continuation prefix, counting a tab as eight.
    ws_columns = len(continuation_ws.replace('\t', SPACE8))
    # BAW: I believe `chunks' and `maxlinelen' should be non-public.
    self._chunks = []
    if s is not None:
        self.append(s, charset)
    if maxlinelen is None:
        maxlinelen = MAXLINELEN
    if header_name is not None:
        # Leave room on the first line for the field name, plus 2 extra
        # for the colon and the space that follow it.
        self._firstlinelen = maxlinelen - len(header_name) - 2
    else:
        # Nothing is known about the field header, so the first line may
        # be as long as any other line.
        self._firstlinelen = maxlinelen
    # Second and subsequent lines lose the columns taken up by the
    # continuation whitespace prefix.
    self._maxlinelen = maxlinelen - ws_columns

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

168

169

def __str__(self):

170

"""A synonym for self.encode()."""

171

return self.encode()

172

Barry Warsaw

8e69bda

2002-06-29 03:26:58 +0000

[diff] [blame]

173

def __unicode__(self):

174

"""Helper for the built-in unicode function."""

175

# charset item is a Charset instance so we need to stringify it.

176

uchunks = [unicode(s, str(charset)) for s, charset in self._chunks]

177

return u''.join(uchunks)

178

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame^]

179

# Rich comparison operators for equality only. BAW: does it make sense to

180

# have or explicitly disable <, <=, >, >= operators?

181

def __eq__(self, other):

182

# other may be a Header or a string. Both are fine so coerce

183

# ourselves to a string, swap the args and do another comparison.

184

return other == self.encode()

185

186

def __ne__(self, other):

187

return not self == other

188

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

189

def append(self, s, charset=None):
    """Add string s, tagged with Charset charset, to the MIME header.

    When charset is omitted (None) the charset given to the class
    constructor is used.  When supplied, it should be a Charset instance
    (not a character set name string!).
    """
    effective = charset
    if effective is None:
        effective = self._charset
    self._chunks.append((s, effective))

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

199

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

200

def _split(self, s, charset, firstline=0):
    """Split s into a list of (string, charset) pairs for encoding.

    BAW: this appears to be a private convenience method.

    A string whose encoded header length is within self._maxlinelen is
    returned as a single pair.  Anything longer is split in two and each
    half is split again recursively; firstline is passed on only to the
    leading half.
    """
    splittable = charset.to_splittable(s)
    encoded = charset.from_splittable(splittable)
    elen = charset.encoded_header_len(encoded)

    if elen <= self._maxlinelen:
        return [(encoded, charset)]
    # BAW: I'm not sure what the right test here is.  RFC 2822 ($2.2.3)
    # recommends that folding SHOULD be limited to placing the CRLF at
    # higher-level syntactic breaks.  For now this is only attempted for
    # us-ascii, although other charsets might benefit as well.
    if charset == 'us-ascii':
        return self._ascii_split(s, charset, firstline)
    # BAW: should we use encoded?
    if elen == len(s):
        # Encoding does not change the size of the string, so it is safe
        # to cut exactly on the _maxlinelen boundary.
        cut = self._maxlinelen
    else:
        # Divide and conquer.
        cut = _floordiv(len(splittable), 2)
    first = charset.from_splittable(splittable[:cut], 0)
    last = charset.from_splittable(splittable[cut:], 0)
    # Do the split
    head = self._split(first, charset, firstline)
    tail = self._split(last, charset)
    return head + tail

238

239

def _ascii_split(self, s, charset, firstline):
    """Fold s at the highest-level syntactic break possible.

    There are not a lot of smarts about field syntax here: each physical
    line of s is first broken on semi-colons and, if that does not help,
    on runs of whitespace.  When firstline is true the first line is
    limited to self._firstlinelen; every other line is stripped of
    leading whitespace and limited to self._maxlinelen.

    Returns a list of (chunk, charset) pairs, one per output line.
    """
    rtn = []
    lines = s.splitlines()
    while lines:
        line = lines.pop(0)
        if firstline:
            maxlinelen = self._firstlinelen
            firstline = 0
        else:
            line = line.lstrip()
            maxlinelen = self._maxlinelen
        # Short lines can remain unchanged (tabs count as eight columns)
        if len(line.replace('\t', SPACE8)) <= maxlinelen:
            rtn.append(line)
        else:
            oldlen = len(line)
            # Try to break the line on semicolons, but if that doesn't
            # work, try to split on folding whitespace.
            while len(line) > maxlinelen:
                i = line.rfind(';', 0, maxlinelen)
                if i < 0:
                    break
                rtn.append(line[:i] + ';')
                line = line[i+1:]
            # Is the remaining stuff still longer than maxlinelen?
            if len(line) <= maxlinelen:
                # Splitting on semis worked
                rtn.append(line)
                continue
            # Splitting on semis didn't finish the job.  If it did any
            # work at all, stick the remaining junk on the front of the
            # `lines' sequence and let the next pass do its thing.
            if len(line) != oldlen:
                lines.insert(0, line)
                continue
            # Otherwise, splitting on semis didn't help at all.  The
            # capturing group keeps the whitespace runs, so `parts'
            # alternates text, whitespace, text, ...
            parts = re.split(r'(\s+)', line)
            if len(parts) == 1 or (len(parts) == 3 and
                                   parts[0].endswith(':')):
                # This line can't be split on whitespace.  There's now
                # little we can do to get this into maxlinelen.  BAW:
                # We're still potentially breaking the RFC by possibly
                # allowing lines longer than the absolute maximum of 998
                # characters.  For now, let it slide.
                #
                # len(parts) will be 1 if this line has no `Field: '
                # prefix, otherwise it will be len(3).
                rtn.append(line)
                continue
            # There is whitespace we can split on.
            first = parts.pop(0)
            sublines = [first]
            acc = len(first)
            while parts:
                len0 = len(parts[0])
                len1 = len(parts[1])
                if acc + len0 + len1 <= maxlinelen:
                    sublines.append(parts.pop(0))
                    sublines.append(parts.pop(0))
                    acc += len0 + len1
                else:
                    # Split it here, but don't forget to ignore the
                    # next whitespace-only part.  Only a genuinely empty
                    # sub-line is skipped: testing the joined text rather
                    # than `first' keeps what was accumulated behind an
                    # empty leading part (a line starting with
                    # whitespace), which used to be thrown away.
                    joined = EMPTYSTRING.join(sublines)
                    if joined != '':
                        rtn.append(joined)
                    del parts[0]
                    first = parts.pop(0)
                    sublines = [first]
                    acc = len(first)
            rtn.append(EMPTYSTRING.join(sublines))
    return [(chunk, charset) for chunk in rtn]

313

314

def _encode_chunks(self):
    """MIME-encode a header with many different charsets and/or encodings.

    Walks the (string, charset) pairs held in self._chunks and returns a
    single string suitable for use in a header field.  Each pair may
    carry a different charset and/or encoding, and the result reflects
    each setting.  A chunk whose charset is None, or whose charset has no
    header encoding, is emitted as is; every other chunk is run through
    its charset's header_encode().

    Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
    character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
    non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
    (no encoding).

    The output lines are joined with a newline followed by the
    continuation whitespace, giving a string in the format:

    "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
      =?charset2?b?SvxyZ2VuIEL2aW5n?="
    """
    pieces = []
    for text, cs in self._chunks:
        if cs is not None and cs.header_encoding is not None:
            _max_append(pieces, cs.header_encode(text, 0),
                        self._maxlinelen, ' ')
        else:
            # There's no encoding for this chunk's charset
            _max_append(pieces, text, self._maxlinelen)
    separator = NL + self._continuation_ws
    return separator.join(pieces)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

343

344

def encode(self):

345

"""Encode a message header, possibly converting charset and encoding.

346

347

There are many issues involved in converting a given string for use in

348

an email header. Only certain character sets are readable in most

349

email clients, and as header strings can only contain a subset of

350

7-bit ASCII, care must be taken to properly convert and encode (with

351

Base64 or quoted-printable) header strings. In addition, there is a

352

75-character length limit on any given encoded header field, so

353

line-wrapping must be performed, even with double-byte character sets.

Tim Peters

2002-05-23 15:15:30 +0000

[diff] [blame]

354

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

355

This method will do its best to convert the string to the correct

356

character set used in email, and encode and line wrap it safely with

357

the appropriate scheme for that character set.

358

359

If the given charset is not known or an error occurs during

360

conversion, this function will return the header untouched.

361

"""

362

newchunks = []

363

for s, charset in self._chunks:

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

364

newchunks += self._split(s, charset, 1)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

365

self._chunks = newchunks

Barry Warsaw