Blame - Lib/email/header.py - platform/external/python/cpython2

2006-03-18 15:41:53 +0000

[diff] [blame]

1

Barry Warsaw

bb11386

2004-10-03 03:16:19 +0000

[diff] [blame]

2

# Author: Ben Gertzfield, Barry Warsaw

3

# Contact: email-sig@python.org

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

4

5

"""Header encoding and decoding functionality."""

6

Barry Warsaw

2006-03-18 15:41:53 +0000

[diff] [blame]

__all__ = [

'Header',

'decode_header',

'make_header',

]

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

13

import re

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

14

import binascii

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

15

Barry Warsaw

2006-03-18 15:41:53 +0000

[diff] [blame]

16

import email.quoprimime

17

import email.base64mime

18

19

from email.errors import HeaderParseError

20

from email.charset import Charset

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

21

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

22

NL = '\n'

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

23

SPACE = ' '

Barry Warsaw

2003-03-06 16:10:30 +0000

[diff] [blame]

24

USPACE = u' '

Barry Warsaw

2002-06-28 23:46:53 +0000

[diff] [blame]

25

SPACE8 = ' ' * 8

Barry Warsaw

2003-03-06 16:10:30 +0000

[diff] [blame]

26

UEMPTYSTRING = u''

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

MAXLINELEN = 76

Barry Warsaw

2002-09-30 15:51:31 +0000

[diff] [blame]

30

USASCII = Charset('us-ascii')

31

UTF8 = Charset('utf-8')

32

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

33

# Match encoded-word strings in the form =?charset?q?Hello_World?=

34

ecre = re.compile(r'''

35

=\? # literal =?

36

(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset

37

\? # literal ?

38

(?P<encoding>[qb]) # either a "q" or a "b", case insensitive

39

\? # literal ?

40

(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string

41

\?= # literal ?=

Barry Warsaw

dcd24ae

2007-03-14 04:59:50 +0000

[diff] [blame]

42

(?=[ \t]|$) # whitespace or the end of the string

43

''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

44

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

45

# Field name regexp, including trailing colon, but not separating whitespace,

46

# according to RFC 2822.  Character range is from exclamation mark to tilde.

47

# For use with .match()

48

fcre = re.compile(r'[\041-\176]+:$')

49

Ezio Melotti

c2077b0

2011-03-16 12:34:31 +0200

[diff] [blame]

50

# Find a header embedded in a putative header value. Used to check for

R. David Murray

d97f5ce

2011-01-09 03:02:04 +0000

[diff] [blame]

51

# header injection attack.

52

_embeded_header = re.compile(r'\n[^ \t]+:')

53

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

54

55

56

# Helpers

Barry Warsaw

2006-03-18 15:41:53 +0000

[diff] [blame]

57

_max_append = email.quoprimime._max_append

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

def decode_header(header):
    """Decode a message header value without converting charset.

    Returns a list of (decoded_string, charset) pairs, one for each decoded
    part of the header.  Charset is None for non-encoded parts of the header,
    otherwise it is a lower-case string naming the character set given in the
    encoded word.

    An email.errors.HeaderParseError is raised when a base64 encoded word
    cannot be decoded.
    """
    header = str(header)
    # Fast path: nothing in the value looks like an RFC 2047 encoded word.
    if ecre.search(header) is None:
        return [(header, None)]
    decoded = []
    for line in header.splitlines():
        if ecre.search(line) is None:
            # No encoded word on this physical line; keep it verbatim.
            decoded.append((line, None))
            continue
        # ecre has three groups, so split() produces a flat list shaped like
        #     [text, charset, encoding, payload, text, charset, ...]
        # which we walk in strides of four.
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start].strip()
            if plain:
                if decoded and decoded[-1][1] is None:
                    # Glue consecutive unencoded runs together with a space.
                    previous = decoded[-1][0]
                    decoded[-1] = (previous + SPACE + plain, None)
                else:
                    decoded.append((plain, None))
            word = fields[start + 1:start + 4]
            if not word:
                # Trailing unencoded text; no encoded word follows it.
                continue
            charset = word[0].lower()
            encoding = word[1].lower()
            payload = word[2]
            text = None
            if encoding == 'q':
                text = email.quoprimime.header_decode(payload)
            elif encoding == 'b':
                # Postel's law: add any missing base64 padding.
                shortfall = len(payload) % 4
                if shortfall:
                    payload += '==='[:4 - shortfall]
                try:
                    text = email.base64mime.decode(payload)
                except binascii.Error:
                    # Surface a higher level exception.  The low level one is
                    # discarded because there is no exception chaining here.
                    raise HeaderParseError
            if text is None:
                text = payload
            # Adjacent encoded words in the same charset are concatenated.
            if decoded and decoded[-1][1] == charset:
                decoded[-1] = (decoded[-1][0] + text, decoded[-1][1])
            else:
                decoded.append((text, charset))
    return decoded

Barry Warsaw

2002-07-09 16:33:47 +0000

[diff] [blame]

121

def make_header(decoded_seq, maxlinelen=None, header_name=None,
                continuation_ws=' '):
    """Build a Header from the pairs produced by decode_header().

    decoded_seq is a sequence of (decoded_string, charset) pairs, where
    charset is None, the string name of a character set, or a Charset
    instance.

    Optional maxlinelen, header_name, and continuation_ws have the same
    meaning as in the Header constructor.  A Header instance is returned.
    """
    result = Header(maxlinelen=maxlinelen, header_name=header_name,
                    continuation_ws=continuation_ws)
    for text, charset in decoded_seq:
        # None is handed to append() untouched, where it selects the Header's
        # default charset; charset names are wrapped in a Charset first.
        if charset is None or isinstance(charset, Charset):
            result.append(text, charset)
        else:
            result.append(text, Charset(charset))
    return result

Barry Warsaw

2002-04-10 21:01:31 +0000

[diff] [blame]

144

class Header:
    # A header value held as a list of (byte string, Charset) chunks, which
    # encode() renders as a folded, RFC 2047 encoded string.

    def __init__(self, s=None, charset=None,
                 maxlinelen=None, header_name=None,
                 continuation_ws=' ', errors='strict'):
        """Create a MIME-compliant header that can contain many character sets.

        Optional s is the initial header value.  If it is None, no initial
        value is set and the header can be filled in later with .append()
        calls.  s may be a byte string or a Unicode string; see the .append()
        documentation for the semantics.

        Optional charset has the same meaning as the charset argument of
        .append(), and it also becomes the default character set for every
        later .append() call that omits its own.  When it is not given,
        us-ascii is used for both purposes.

        maxlinelen sets the maximum line length explicitly (the default is
        76).  To make the first line shorter, accounting for the field name
        that is not part of s (e.g. `Subject'), pass the field name as
        header_name.

        continuation_ws must be RFC 2822 compliant folding whitespace (usually
        a space or a hard tab); it is prepended to continuation lines.

        errors is passed through to the .append() call.
        """
        if charset is None:
            charset = USASCII
        if not isinstance(charset, Charset):
            charset = Charset(charset)
        self._charset = charset
        self._continuation_ws = continuation_ws
        # Width of the continuation prefix with each tab counted as 8 columns.
        prefix_width = len(continuation_ws.replace('\t', SPACE8))
        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
        self._chunks = []
        if s is not None:
            self.append(s, charset, errors)
        limit = MAXLINELEN if maxlinelen is None else maxlinelen
        if header_name is None:
            # Nothing is known about the field name, so the first line is as
            # long as any other line.
            first_limit = limit
        else:
            # Leave room for the field name plus the colon and the space.
            first_limit = limit - len(header_name) - 2
        self._firstlinelen = first_limit
        # Continuation lines lose the columns taken by the whitespace prefix.
        self._maxlinelen = limit - prefix_width

    def __str__(self):
        """A synonym for self.encode()."""
        return self.encode()

    def __unicode__(self):
        """Helper for the built-in unicode function."""
        pieces = []
        prev_cs = None
        for text, charset in self._chunks:
            # Spaces between encoded and non-encoded words must be kept, so a
            # space is inserted whenever we cross between None/us-ascii and
            # any other charset.  This never applies to the first chunk.
            cur_cs = charset
            if pieces:
                was_plain = prev_cs in (None, 'us-ascii')
                is_plain = cur_cs in (None, 'us-ascii')
                if was_plain != is_plain:
                    pieces.append(USPACE)
                    if is_plain:
                        cur_cs = None
            prev_cs = cur_cs
            pieces.append(unicode(text, str(charset)))
        return UEMPTYSTRING.join(pieces)

    # Rich comparison operators for equality only.  BAW: does it make sense to
    # have or explicitly disable <, <=, >, >= operators?
    def __eq__(self, other):
        # other may be a Header or a string.  Either works: render ourselves
        # as a string and let other perform the comparison.
        rendered = self.encode()
        return other == rendered

    def __ne__(self, other):
        return not self == other

    def append(self, s, charset=None, errors='strict'):
        """Append a string to the MIME header.

        Optional charset, if given, should be a Charset instance or the name
        of a character set (which is converted to a Charset instance).  None,
        the default, selects the charset given to the constructor.

        s may be a byte string or a Unicode string.  For a byte string (i.e.
        isinstance(s, str) is true), charset is the encoding of that string
        and a UnicodeError is raised if the string cannot be decoded with it.
        For a Unicode string, charset is only a hint: the string is encoded
        with the first of us-ascii, the charset hint, and utf-8 that does not
        provoke a UnicodeError.

        Optional `errors' is passed as the third argument to any unicode() or
        ustr.encode() call.
        """
        if charset is None:
            charset = self._charset
        elif not isinstance(charset, Charset):
            charset = Charset(charset)
        # The faux 8bit charset means "leave the string unchanged".
        if charset != '8bit':
            if isinstance(s, str):
                # Validate only: the byte string must survive decoding with
                # the input codec and re-encoding with the output codec (the
                # two may differ).  The original byte string is what we store.
                decoded = unicode(s, charset.input_codec or 'us-ascii', errors)
                decoded.encode(charset.output_codec or 'us-ascii', errors)
            elif isinstance(s, unicode):
                # Store a byte string, encoded with the first codec that
                # accepts the text.
                for candidate in (USASCII, charset, UTF8):
                    charset = candidate
                    codec = candidate.output_codec or 'us-ascii'
                    try:
                        s = s.encode(codec, errors)
                    except UnicodeError:
                        continue
                    break
                else:
                    assert False, 'utf-8 conversion failed'
        self._chunks.append((s, charset))

    def _split(self, s, charset, maxlinelen, splitchars):
        # Split up a header safely for use with _encode_chunks().
        splittable = charset.to_splittable(s)
        encoded = charset.from_splittable(splittable, True)
        elen = charset.encoded_header_len(encoded)
        # If the line's encoded length fits, just return it.
        if elen <= maxlinelen:
            return [(encoded, charset)]
        # If we have undetermined raw 8bit characters sitting in a byte
        # string, we really don't know what the right thing to do is.  It
        # might be multibyte data which a split could break apart.  The least
        # harm seems to be not splitting at all, even though the result may
        # then go out longer than maxlinelen.
        if charset == '8bit':
            return [(s, charset)]
        # BAW: I'm not sure what the right test here is.  What we're trying to
        # do is be faithful to RFC 2822's recommendation that ($2.2.3):
        #
        # "Note: Though structured field bodies are defined in such a way that
        #  folding can take place between many of the lexical tokens (and even
        #  within some of the lexical tokens), folding SHOULD be limited to
        #  placing the CRLF at higher-level syntactic breaks."
        #
        # For now, I can only imagine doing this when the charset is us-ascii,
        # although it's possible that other charsets may also benefit from the
        # higher-level syntactic breaks.
        if charset == 'us-ascii':
            return self._split_ascii(s, charset, maxlinelen, splitchars)
        # BAW: should we use encoded?
        if elen == len(s):
            # Encoding does not change the size of the string, so we can cut
            # directly on the maxlinelen boundary.
            head = charset.from_splittable(splittable[:maxlinelen], False)
            tail = charset.from_splittable(splittable[maxlinelen:], False)
        else:
            # Binary search for the split point.
            head, tail = _binsplit(splittable, charset, maxlinelen)
        # head is of the proper length, so it only needs wrapping in the
        # appropriate chrome.  tail must be split recursively.
        head_encoded = charset.from_splittable(charset.to_splittable(head),
                                               True)
        remainder = self._split(tail, charset, self._maxlinelen, splitchars)
        return [(head_encoded, charset)] + remainder

    def _split_ascii(self, s, charset, firstlen, splitchars):
        # Delegate to the module-level splitter and tag every resulting line
        # with the charset.
        pieces = _split_ascii(s, firstlen, self._maxlinelen,
                              self._continuation_ws, splitchars)
        return [(piece, charset) for piece in pieces]

    def _encode_chunks(self, newchunks, maxlinelen):
        # MIME-encode a header with many different charsets and/or encodings.
        #
        # Given a list of pairs (string, charset), return a MIME-encoded
        # string suitable for use in a header field.  Each pair may have
        # different charsets and/or encodings, and the resulting header will
        # accurately reflect each setting.
        #
        # Each encoding can be email.utils.QP (quoted-printable, for
        # ASCII-like character sets like iso-8859-1), email.utils.BASE64
        # (Base64, for non-ASCII like character sets like KOI8-R and
        # iso-2022-jp), or None (no encoding).
        #
        # Each pair will be represented on a separate line; the resulting
        # string will be in the format:
        #
        # =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
        #  =?charset2?b?SvxyZ2VuIEL2aW5n?="
        rendered = []
        for text, charset in newchunks:
            if not text:
                continue
            if charset is not None and charset.header_encoding is not None:
                text = charset.header_encode(text)
            # Don't add more folding whitespace than necessary.
            if rendered and rendered[-1].endswith(' '):
                separator = ''
            else:
                separator = ' '
            _max_append(rendered, text, maxlinelen, separator)
        return (NL + self._continuation_ws).join(rendered)

    def encode(self, splitchars=';, '):
        """Encode a message header into an RFC-compliant format.

        Converting a string for use in an email header raises several
        issues.  Only certain character sets are readable in most email
        clients, and because header strings may only contain a subset of
        7-bit ASCII, they have to be converted and encoded (with Base64 or
        quoted-printable) with care.  There is also a 75-character length
        limit on any given encoded header field, so line-wrapping has to be
        performed, even with double-byte character sets.

        This method does its best to convert the string to the correct
        character set used in email, and to encode and line wrap it safely
        with the appropriate scheme for that character set.

        If the given charset is not known or an error occurs during
        conversion, this function will return the header untouched.

        Optional splitchars is a string containing characters to split long
        ASCII lines on, in rough support of RFC 2822's `highest level
        syntactic breaks'.  This doesn't affect RFC 2047 encoded lines.

        An email.errors.HeaderParseError is raised if the encoded value
        appears to contain an embedded header.
        """
        pieces = []
        maxlinelen = self._firstlinelen
        used = 0
        for text, charset in self._chunks:
            # The first bit of the next chunk should be just long enough to
            # fill the current line.  Don't forget the space separating the
            # encoded words.
            room = maxlinelen - used - 1
            if room < charset.encoded_header_len(''):
                # Not even an empty encoded word fits; use the next line.
                room = maxlinelen
            pieces.extend(self._split(text, charset, room, splitchars))
            tail_text, tail_charset = pieces[-1]
            used = tail_charset.encoded_header_len(tail_text)
        value = self._encode_chunks(pieces, maxlinelen)
        # Guard against header injection through the value.
        if _embeded_header.search(value):
            raise HeaderParseError("header value appears to contain "
                "an embedded header: {!r}".format(value))
        return value

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

def _split_ascii(s, firstlen, restlen, continuation_ws, splitchars):
    """Fold the ASCII string s into a list of lines.

    s is split on its existing line breaks first.  Any line that is not
    shorter than the current limit is then broken on the first character of
    splitchars that occurs in it.  The characters are tried in the order
    given, which by default is semi-colons, then commas, then whitespace.

    firstlen is the limit for the first line produced and restlen the limit
    for every later one.  continuation_ws is only used for its width, with
    each tab counted as 8 columns; the returned lines do not include it.

    Returns the list of lines.  A line containing none of the split
    characters is returned unchanged, even if it exceeds the limit.
    """
    lines = []
    maxlen = firstlen
    for line in s.splitlines():
        # Ignore any leading whitespace (i.e. continuation whitespace) already
        # on the line, since we'll be adding our own.
        line = line.lstrip()
        if len(line) < maxlen:
            lines.append(line)
            maxlen = restlen
            continue
        # Attempt to split the line at the highest-level syntactic break
        # possible.  Note that we don't have a lot of smarts about field
        # syntax; we just try the split characters in the order given.
        for ch in splitchars:
            if ch in line:
                break
        else:
            # There's nothing useful to split the line on, not even spaces, so
            # just append this line unchanged
            lines.append(line)
            maxlen = restlen
            continue
        # Now split the line on the character plus trailing whitespace.  The
        # character is escaped so that a split character which happens to be
        # a regular expression metacharacter is still matched literally.
        cre = re.compile(r'%s\s*' % re.escape(ch))
        # Semi-colons and commas are kept at the end of the part they close;
        # a whitespace split character is simply dropped.
        if ch in ';,':
            eol = ch
        else:
            eol = ''
        joiner = eol + ' '
        joinlen = len(joiner)
        wslen = len(continuation_ws.replace('\t', SPACE8))
        this = []
        linelen = 0
        for part in cre.split(line):
            # Length of the pending line once its parts are joined.
            curlen = linelen + max(0, len(this)-1) * joinlen
            partlen = len(part)
            onfirstline = not lines
            # We don't want to split after the field name, if we're on the
            # first line and the field name is present in the header string.
            if ch == ' ' and onfirstline and \
                   len(this) == 1 and fcre.match(this[0]):
                this.append(part)
                linelen += partlen
            elif curlen + partlen > maxlen:
                if this:
                    lines.append(joiner.join(this) + eol)
                # If this part is longer than maxlen and we aren't already
                # splitting on whitespace, try to recursively split this line
                # on whitespace.
                if partlen > maxlen and ch != ' ':
                    subl = _split_ascii(part, maxlen, restlen,
                                        continuation_ws, ' ')
                    lines.extend(subl[:-1])
                    this = [subl[-1]]
                else:
                    this = [part]
                linelen = wslen + len(this[-1])
                maxlen = restlen
            else:
                this.append(part)
                linelen += partlen
        # Put any left over parts on a line by themselves
        if this:
            lines.append(joiner.join(this))
    return lines

Barry Warsaw

2003-03-06 05:39:46 +0000

[diff] [blame]

def _binsplit(splittable, charset, maxlinelen):

i = 0

j = len(splittable)

while i < j:

# Invariants:

# 1. splittable[:k] fits for all k <= i (note that we *assume*,

494

# at the start, that splittable[:0] fits).

495

# 2. splittable[:k] does not fit for any k > j (at the start,

496

# this means we shouldn't look at any k > len(splittable)).

497

# 3. We don't know about splittable[:k] for k in i+1..j.

498

# 4. We want to set i to the largest k that fits, with i <= k <= j.

499

#

500

m = (i+j+1) >> 1 # ceiling((i+j)/2); i < m <= j

501

chunk = charset.from_splittable(splittable[:m], True)

502

chunklen = charset.encoded_header_len(chunk)

503

if chunklen <= maxlinelen:

504

# m is acceptable, so is a new lower bound.

505

i = m

506

else:

Tim Peters

2b48213

2003-03-06 23:41:58 +0000

[diff] [blame]

507

# m is not acceptable, so final i must be < m.

Barry Warsaw